# Load the required libraries
library(dplyr)
library(ggplot2)
library(tidyr)
library(rpart)
library(ggcorrplot)
library(corrplot)
library(caret)
library(randomForest)
library(ROCR)
library(doParallel)
library(foreach)
library(DataExplorer)
library(skimr)

Predicting Article Popularity in Online News Media
Machine learning & Artificial Intelligence - Final Project

1 Data collection and Pre-Processing
1.1 Package Initializing

Import the data and view the first few rows to understand its content.
# Load the data ----
# Keep the dataset directory in a single variable so the two file paths
# cannot drift out of sync (the directory was previously duplicated).
data_dir <- "/Users/chiamakaogugua/Desktop/MBAN_SMU/MBAN 5560/Final/OnlineNewsPopularity"
file_path <- file.path(data_dir, "OnlineNewsPopularity.csv")
name_path <- file.path(data_dir, "OnlineNewsPopularity.names")

# Read the CSV file into a data frame
data <- read.csv(file_path)
# Read the names file
names <- readLines(name_path)
Warning in readLines(name_path): incomplete final line found on
'/Users/chiamakaogugua/Desktop/MBAN
5560/Final/OnlineNewsPopularity/OnlineNewsPopularity.names'
# View the first few rows of the data frame
head(data)
                                                           url timedelta
1 http://mashable.com/2013/01/07/amazon-instant-video-browser/ 731
2 http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/ 731
3 http://mashable.com/2013/01/07/apple-40-billion-app-downloads/ 731
4 http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/ 731
5 http://mashable.com/2013/01/07/att-u-verse-apps/ 731
6 http://mashable.com/2013/01/07/beewi-smart-toys/ 731
n_tokens_title n_tokens_content n_unique_tokens n_non_stop_words
1 12 219 0.6635945 1
2 9 255 0.6047431 1
3 9 211 0.5751295 1
4 9 531 0.5037879 1
5 13 1072 0.4156456 1
6 10 370 0.5598886 1
n_non_stop_unique_tokens num_hrefs num_self_hrefs num_imgs num_videos
1 0.8153846 4 2 1 0
2 0.7919463 3 1 1 0
3 0.6638655 3 1 1 0
4 0.6656347 9 0 1 0
5 0.5408895 19 19 20 0
6 0.6981982 2 2 0 0
average_token_length num_keywords data_channel_is_lifestyle
1 4.680365 5 0
2 4.913725 4 0
3 4.393365 6 0
4 4.404896 7 0
5 4.682836 7 0
6 4.359459 9 0
data_channel_is_entertainment data_channel_is_bus data_channel_is_socmed
1 1 0 0
2 0 1 0
3 0 1 0
4 1 0 0
5 0 0 0
6 0 0 0
data_channel_is_tech data_channel_is_world kw_min_min kw_max_min kw_avg_min
1 0 0 0 0 0
2 0 0 0 0 0
3 0 0 0 0 0
4 0 0 0 0 0
5 1 0 0 0 0
6 1 0 0 0 0
kw_min_max kw_max_max kw_avg_max kw_min_avg kw_max_avg kw_avg_avg
1 0 0 0 0 0 0
2 0 0 0 0 0 0
3 0 0 0 0 0 0
4 0 0 0 0 0 0
5 0 0 0 0 0 0
6 0 0 0 0 0 0
self_reference_min_shares self_reference_max_shares
1 496 496
2 0 0
3 918 918
4 0 0
5 545 16000
6 8500 8500
self_reference_avg_sharess weekday_is_monday weekday_is_tuesday
1 496.000 1 0
2 0.000 1 0
3 918.000 1 0
4 0.000 1 0
5 3151.158 1 0
6 8500.000 1 0
weekday_is_wednesday weekday_is_thursday weekday_is_friday
1 0 0 0
2 0 0 0
3 0 0 0
4 0 0 0
5 0 0 0
6 0 0 0
weekday_is_saturday weekday_is_sunday is_weekend LDA_00 LDA_01
1 0 0 0 0.50033120 0.37827893
2 0 0 0 0.79975569 0.05004668
3 0 0 0 0.21779229 0.03333446
4 0 0 0 0.02857322 0.41929964
5 0 0 0 0.02863281 0.02879355
6 0 0 0 0.02224528 0.30671758
LDA_02 LDA_03 LDA_04 global_subjectivity
1 0.04000468 0.04126265 0.04012254 0.5216171
2 0.05009625 0.05010067 0.05000071 0.3412458
3 0.03335142 0.03333354 0.68218829 0.7022222
4 0.49465083 0.02890472 0.02857160 0.4298497
5 0.02857518 0.02857168 0.88542678 0.5135021
6 0.02223128 0.02222429 0.62658158 0.4374086
global_sentiment_polarity global_rate_positive_words
1 0.09256198 0.04566210
2 0.14894781 0.04313725
3 0.32333333 0.05687204
4 0.10070467 0.04143126
5 0.28100348 0.07462687
6 0.07118419 0.02972973
global_rate_negative_words rate_positive_words rate_negative_words
1 0.013698630 0.7692308 0.2307692
2 0.015686275 0.7333333 0.2666667
3 0.009478673 0.8571429 0.1428571
4 0.020715631 0.6666667 0.3333333
5 0.012126866 0.8602151 0.1397849
6 0.027027027 0.5238095 0.4761905
avg_positive_polarity min_positive_polarity max_positive_polarity
1 0.3786364 0.10000000 0.7
2 0.2869146 0.03333333 0.7
3 0.4958333 0.10000000 1.0
4 0.3859652 0.13636364 0.8
5 0.4111274 0.03333333 1.0
6 0.3506100 0.13636364 0.6
avg_negative_polarity min_negative_polarity max_negative_polarity
1 -0.3500000 -0.600 -0.2000000
2 -0.1187500 -0.125 -0.1000000
3 -0.4666667 -0.800 -0.1333333
4 -0.3696970 -0.600 -0.1666667
5 -0.2201923 -0.500 -0.0500000
6 -0.1950000 -0.400 -0.1000000
title_subjectivity title_sentiment_polarity abs_title_subjectivity
1 0.5000000 -0.1875000 0.00000000
2 0.0000000 0.0000000 0.50000000
3 0.0000000 0.0000000 0.50000000
4 0.0000000 0.0000000 0.50000000
5 0.4545455 0.1363636 0.04545455
6 0.6428571 0.2142857 0.14285714
abs_title_sentiment_polarity shares
1 0.1875000 593
2 0.0000000 711
3 0.0000000 1500
4 0.0000000 1200
5 0.1363636 505
6 0.2142857 855
Get an understanding of what each feature represents by viewing the data dictionary.
# View the names file
print(names)
  [1] "1. Title: Online News Popularity"
[2] ""
[3] "2. Source Information"
[4] " -- Creators: Kelwin Fernandes (kafc ‘@’ inesctec.pt, kelwinfc ’@’ gmail.com),"
[5] " Pedro Vinagre (pedro.vinagre.sousa ’@’ gmail.com) and"
[6] " Pedro Sernadela"
[7] " -- Donor: Kelwin Fernandes (kafc ’@’ inesctec.pt, kelwinfc '@' gmail.com)"
[8] " -- Date: May, 2015"
[9] ""
[10] "3. Past Usage:"
[11] " 1. K. Fernandes, P. Vinagre and P. Cortez. A Proactive Intelligent Decision"
[12] " Support System for Predicting the Popularity of Online News. Proceedings"
[13] " of the 17th EPIA 2015 - Portuguese Conference on Artificial Intelligence,"
[14] " September, Coimbra, Portugal."
[15] ""
[16] " -- Results: "
[17] " -- Binary classification as popular vs unpopular using a decision"
[18] " threshold of 1400 social interactions."
[19] " -- Experiments with different models: Random Forest (best model),"
[20] " Adaboost, SVM, KNN and Naïve Bayes."
[21] " -- Recorded 67% of accuracy and 0.73 of AUC."
[22] " - Predicted attribute: online news popularity (boolean)"
[23] ""
[24] "4. Relevant Information:"
[25] " -- The articles were published by Mashable (www.mashable.com) and their"
[26] " content as the rights to reproduce it belongs to them. Hence, this"
[27] " dataset does not share the original content but some statistics"
[28] " associated with it. The original content be publicly accessed and"
[29] " retrieved using the provided urls."
[30] " -- Acquisition date: January 8, 2015"
[31] " -- The estimated relative performance values were estimated by the authors"
[32] " using a Random Forest classifier and a rolling windows as assessment"
[33] " method. See their article for more details on how the relative"
[34] " performance values were set."
[35] ""
[36] "5. Number of Instances: 39797 "
[37] ""
[38] "6. Number of Attributes: 61 (58 predictive attributes, 2 non-predictive, "
[39] " 1 goal field)"
[40] ""
[41] "7. Attribute Information:"
[42] " 0. url: URL of the article"
[43] " 1. timedelta: Days between the article publication and"
[44] " the dataset acquisition"
[45] " 2. n_tokens_title: Number of words in the title"
[46] " 3. n_tokens_content: Number of words in the content"
[47] " 4. n_unique_tokens: Rate of unique words in the content"
[48] " 5. n_non_stop_words: Rate of non-stop words in the content"
[49] " 6. n_non_stop_unique_tokens: Rate of unique non-stop words in the"
[50] " content"
[51] " 7. num_hrefs: Number of links"
[52] " 8. num_self_hrefs: Number of links to other articles"
[53] " published by Mashable"
[54] " 9. num_imgs: Number of images"
[55] " 10. num_videos: Number of videos"
[56] " 11. average_token_length: Average length of the words in the"
[57] " content"
[58] " 12. num_keywords: Number of keywords in the metadata"
[59] " 13. data_channel_is_lifestyle: Is data channel 'Lifestyle'?"
[60] " 14. data_channel_is_entertainment: Is data channel 'Entertainment'?"
[61] " 15. data_channel_is_bus: Is data channel 'Business'?"
[62] " 16. data_channel_is_socmed: Is data channel 'Social Media'?"
[63] " 17. data_channel_is_tech: Is data channel 'Tech'?"
[64] " 18. data_channel_is_world: Is data channel 'World'?"
[65] " 19. kw_min_min: Worst keyword (min. shares)"
[66] " 20. kw_max_min: Worst keyword (max. shares)"
[67] " 21. kw_avg_min: Worst keyword (avg. shares)"
[68] " 22. kw_min_max: Best keyword (min. shares)"
[69] " 23. kw_max_max: Best keyword (max. shares)"
[70] " 24. kw_avg_max: Best keyword (avg. shares)"
[71] " 25. kw_min_avg: Avg. keyword (min. shares)"
[72] " 26. kw_max_avg: Avg. keyword (max. shares)"
[73] " 27. kw_avg_avg: Avg. keyword (avg. shares)"
[74] " 28. self_reference_min_shares: Min. shares of referenced articles in"
[75] " Mashable"
[76] " 29. self_reference_max_shares: Max. shares of referenced articles in"
[77] " Mashable"
[78] " 30. self_reference_avg_sharess: Avg. shares of referenced articles in"
[79] " Mashable"
[80] " 31. weekday_is_monday: Was the article published on a Monday?"
[81] " 32. weekday_is_tuesday: Was the article published on a Tuesday?"
[82] " 33. weekday_is_wednesday: Was the article published on a Wednesday?"
[83] " 34. weekday_is_thursday: Was the article published on a Thursday?"
[84] " 35. weekday_is_friday: Was the article published on a Friday?"
[85] " 36. weekday_is_saturday: Was the article published on a Saturday?"
[86] " 37. weekday_is_sunday: Was the article published on a Sunday?"
[87] " 38. is_weekend: Was the article published on the weekend?"
[88] " 39. LDA_00: Closeness to LDA topic 0"
[89] " 40. LDA_01: Closeness to LDA topic 1"
[90] " 41. LDA_02: Closeness to LDA topic 2"
[91] " 42. LDA_03: Closeness to LDA topic 3"
[92] " 43. LDA_04: Closeness to LDA topic 4"
[93] " 44. global_subjectivity: Text subjectivity"
[94] " 45. global_sentiment_polarity: Text sentiment polarity"
[95] " 46. global_rate_positive_words: Rate of positive words in the content"
[96] " 47. global_rate_negative_words: Rate of negative words in the content"
[97] " 48. rate_positive_words: Rate of positive words among non-neutral"
[98] " tokens"
[99] " 49. rate_negative_words: Rate of negative words among non-neutral"
[100] " tokens"
[101] " 50. avg_positive_polarity: Avg. polarity of positive words"
[102] " 51. min_positive_polarity: Min. polarity of positive words"
[103] " 52. max_positive_polarity: Max. polarity of positive words"
[104] " 53. avg_negative_polarity: Avg. polarity of negative words"
[105] " 54. min_negative_polarity: Min. polarity of negative words"
[106] " 55. max_negative_polarity: Max. polarity of negative words"
[107] " 56. title_subjectivity: Title subjectivity"
[108] " 57. title_sentiment_polarity: Title polarity"
[109] " 58. abs_title_subjectivity: Absolute subjectivity level"
[110] " 59. abs_title_sentiment_polarity: Absolute polarity level"
[111] " 60. shares: Number of shares (target)"
[112] ""
[113] "8. Missing Attribute Values: None"
[114] ""
[115] "9. Class Distribution: the class value (shares) is continuously valued. We"
[116] " transformed the task into a binary task using a decision"
[117] " threshold of 1400."
[118] ""
[119] " Shares Value Range: Number of Instances in Range:"
[120] " < 1400 18490"
[121] " >= 1400 21154"
[122] ""
[123] ""
[124] "Summary Statistics:"
[125] " Feature Min Max Mean SD"
[126] " timedelta 8.0000 731.0000 354.5305 214.1611"
[127] " n_tokens_title 2.0000 23.0000 10.3987 2.1140"
[128] " n_tokens_content 0.0000 8474.0000 546.5147 471.1016"
[129] " n_unique_tokens 0.0000 701.0000 0.5482 3.5207"
[130] " n_non_stop_words 0.0000 1042.0000 0.9965 5.2312"
[131] " n_non_stop_unique_tokens 0.0000 650.0000 0.6892 3.2648"
[132] " num_hrefs 0.0000 304.0000 10.8837 11.3319"
[133] " num_self_hrefs 0.0000 116.0000 3.2936 3.8551"
[134] " num_imgs 0.0000 128.0000 4.5441 8.3093"
[135] " num_videos 0.0000 91.0000 1.2499 4.1078"
[136] " average_token_length 0.0000 8.0415 4.5482 0.8444"
[137] " num_keywords 1.0000 10.0000 7.2238 1.9091"
[138] " data_channel_is_lifestyle 0.0000 1.0000 0.0529 0.2239"
[139] " data_channel_is_entertainment 0.0000 1.0000 0.1780 0.3825"
[140] " data_channel_is_bus 0.0000 1.0000 0.1579 0.3646"
[141] " data_channel_is_socmed 0.0000 1.0000 0.0586 0.2349"
[142] " data_channel_is_tech 0.0000 1.0000 0.1853 0.3885"
[143] " data_channel_is_world 0.0000 1.0000 0.2126 0.4091"
[144] " kw_min_min -1.0000 377.0000 26.1068 69.6323"
[145] " kw_max_min 0.0000 298400.0000 1153.9517 3857.9422"
[146] " kw_avg_min -1.0000 42827.8571 312.3670 620.7761"
[147] " kw_min_max 0.0000 843300.0000 13612.3541 57985.2980"
[148] " kw_max_max 0.0000 843300.0000 752324.0667 214499.4242"
[149] " kw_avg_max 0.0000 843300.0000 259281.9381 135100.5433"
[150] " kw_min_avg -1.0000 3613.0398 1117.1466 1137.4426"
[151] " kw_max_avg 0.0000 298400.0000 5657.2112 6098.7950"
[152] " kw_avg_avg 0.0000 43567.6599 3135.8586 1318.1338"
[153] " self_reference_min_shares 0.0000 843300.0000 3998.7554 19738.4216"
[154] " self_reference_max_shares 0.0000 843300.0000 10329.2127 41027.0592"
[155] " self_reference_avg_sharess 0.0000 843300.0000 6401.6976 24211.0269"
[156] " weekday_is_monday 0.0000 1.0000 0.1680 0.3739"
[157] " weekday_is_tuesday 0.0000 1.0000 0.1864 0.3894"
[158] " weekday_is_wednesday 0.0000 1.0000 0.1875 0.3903"
[159] " weekday_is_thursday 0.0000 1.0000 0.1833 0.3869"
[160] " weekday_is_friday 0.0000 1.0000 0.1438 0.3509"
[161] " weekday_is_saturday 0.0000 1.0000 0.0619 0.2409"
[162] " weekday_is_sunday 0.0000 1.0000 0.0690 0.2535"
[163] " is_weekend 0.0000 1.0000 0.1309 0.3373"
[164] " LDA_00 0.0000 0.9270 0.1846 0.2630"
[165] " LDA_01 0.0000 0.9259 0.1413 0.2197"
[166] " LDA_02 0.0000 0.9200 0.2163 0.2821"
[167] " LDA_03 0.0000 0.9265 0.2238 0.2952"
[168] " LDA_04 0.0000 0.9272 0.2340 0.2892"
[169] " global_subjectivity 0.0000 1.0000 0.4434 0.1167"
[170] " global_sentiment_polarity -0.3937 0.7278 0.1193 0.0969"
[171] " global_rate_positive_words 0.0000 0.1555 0.0396 0.0174"
[172] " global_rate_negative_words 0.0000 0.1849 0.0166 0.0108"
[173] " rate_positive_words 0.0000 1.0000 0.6822 0.1902"
[174] " rate_negative_words 0.0000 1.0000 0.2879 0.1562"
[175] " avg_positive_polarity 0.0000 1.0000 0.3538 0.1045"
[176] " min_positive_polarity 0.0000 1.0000 0.0954 0.0713"
[177] " max_positive_polarity 0.0000 1.0000 0.7567 0.2478"
[178] " avg_negative_polarity -1.0000 0.0000 -0.2595 0.1277"
[179] " min_negative_polarity -1.0000 0.0000 -0.5219 0.2903"
[180] " max_negative_polarity -1.0000 0.0000 -0.1075 0.0954"
[181] " title_subjectivity 0.0000 1.0000 0.2824 0.3242"
[182] " title_sentiment_polarity -1.0000 1.0000 0.0714 0.2654"
[183] " abs_title_subjectivity 0.0000 0.5000 0.3418 0.1888"
[184] " abs_title_sentiment_polarity 0.0000 1.0000 0.1561 0.2263"
[185] ""
[186] " "
[187] " Citation Request:"
[188] " "
[189] " Please include this citation if you plan to use this database: "
[190] " "
[191] " K. Fernandes, P. Vinagre and P. Cortez. A Proactive Intelligent Decision"
[192] " Support System for Predicting the Popularity of Online News. Proceedings"
[193] " of the 17th EPIA 2015 - Portuguese Conference on Artificial Intelligence,"
[194] " September, Coimbra, Portugal."
# Get a summary of the data
skim(data)

| Name | data |
| Number of rows | 39644 |
| Number of columns | 61 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 60 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| url | 0 | 1 | 34 | 192 | 0 | 39644 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| timedelta | 0 | 1 | 354.53 | 214.16 | 8.00 | 164.00 | 339.00 | 542.00 | 731.00 | ▇▇▆▆▇ |
| n_tokens_title | 0 | 1 | 10.40 | 2.11 | 2.00 | 9.00 | 10.00 | 12.00 | 23.00 | ▁▇▇▁▁ |
| n_tokens_content | 0 | 1 | 546.51 | 471.11 | 0.00 | 246.00 | 409.00 | 716.00 | 8474.00 | ▇▁▁▁▁ |
| n_unique_tokens | 0 | 1 | 0.55 | 3.52 | 0.00 | 0.47 | 0.54 | 0.61 | 701.00 | ▇▁▁▁▁ |
| n_non_stop_words | 0 | 1 | 1.00 | 5.23 | 0.00 | 1.00 | 1.00 | 1.00 | 1042.00 | ▇▁▁▁▁ |
| n_non_stop_unique_tokens | 0 | 1 | 0.69 | 3.26 | 0.00 | 0.63 | 0.69 | 0.75 | 650.00 | ▇▁▁▁▁ |
| num_hrefs | 0 | 1 | 10.88 | 11.33 | 0.00 | 4.00 | 8.00 | 14.00 | 304.00 | ▇▁▁▁▁ |
| num_self_hrefs | 0 | 1 | 3.29 | 3.86 | 0.00 | 1.00 | 3.00 | 4.00 | 116.00 | ▇▁▁▁▁ |
| num_imgs | 0 | 1 | 4.54 | 8.31 | 0.00 | 1.00 | 1.00 | 4.00 | 128.00 | ▇▁▁▁▁ |
| num_videos | 0 | 1 | 1.25 | 4.11 | 0.00 | 0.00 | 0.00 | 1.00 | 91.00 | ▇▁▁▁▁ |
| average_token_length | 0 | 1 | 4.55 | 0.84 | 0.00 | 4.48 | 4.66 | 4.85 | 8.04 | ▁▁▇▃▁ |
| num_keywords | 0 | 1 | 7.22 | 1.91 | 1.00 | 6.00 | 7.00 | 9.00 | 10.00 | ▁▂▇▇▇ |
| data_channel_is_lifestyle | 0 | 1 | 0.05 | 0.22 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| data_channel_is_entertainment | 0 | 1 | 0.18 | 0.38 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| data_channel_is_bus | 0 | 1 | 0.16 | 0.36 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| data_channel_is_socmed | 0 | 1 | 0.06 | 0.23 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| data_channel_is_tech | 0 | 1 | 0.19 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| data_channel_is_world | 0 | 1 | 0.21 | 0.41 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| kw_min_min | 0 | 1 | 26.11 | 69.63 | -1.00 | -1.00 | -1.00 | 4.00 | 377.00 | ▇▁▁▁▁ |
| kw_max_min | 0 | 1 | 1153.95 | 3857.99 | 0.00 | 445.00 | 660.00 | 1000.00 | 298400.00 | ▇▁▁▁▁ |
| kw_avg_min | 0 | 1 | 312.37 | 620.78 | -1.00 | 141.75 | 235.50 | 357.00 | 42827.86 | ▇▁▁▁▁ |
| kw_min_max | 0 | 1 | 13612.35 | 57986.03 | 0.00 | 0.00 | 1400.00 | 7900.00 | 843300.00 | ▇▁▁▁▁ |
| kw_max_max | 0 | 1 | 752324.07 | 214502.13 | 0.00 | 843300.00 | 843300.00 | 843300.00 | 843300.00 | ▁▁▁▁▇ |
| kw_avg_max | 0 | 1 | 259281.94 | 135102.25 | 0.00 | 172846.88 | 244572.22 | 330980.00 | 843300.00 | ▃▇▃▁▁ |
| kw_min_avg | 0 | 1 | 1117.15 | 1137.46 | -1.00 | 0.00 | 1023.64 | 2056.78 | 3613.04 | ▇▃▃▂▂ |
| kw_max_avg | 0 | 1 | 5657.21 | 6098.87 | 0.00 | 3562.10 | 4355.69 | 6019.95 | 298400.00 | ▇▁▁▁▁ |
| kw_avg_avg | 0 | 1 | 3135.86 | 1318.15 | 0.00 | 2382.45 | 2870.07 | 3600.23 | 43567.66 | ▇▁▁▁▁ |
| self_reference_min_shares | 0 | 1 | 3998.76 | 19738.67 | 0.00 | 639.00 | 1200.00 | 2600.00 | 843300.00 | ▇▁▁▁▁ |
| self_reference_max_shares | 0 | 1 | 10329.21 | 41027.58 | 0.00 | 1100.00 | 2800.00 | 8000.00 | 843300.00 | ▇▁▁▁▁ |
| self_reference_avg_sharess | 0 | 1 | 6401.70 | 24211.33 | 0.00 | 981.19 | 2200.00 | 5200.00 | 843300.00 | ▇▁▁▁▁ |
| weekday_is_monday | 0 | 1 | 0.17 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| weekday_is_tuesday | 0 | 1 | 0.19 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| weekday_is_wednesday | 0 | 1 | 0.19 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| weekday_is_thursday | 0 | 1 | 0.18 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| weekday_is_friday | 0 | 1 | 0.14 | 0.35 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| weekday_is_saturday | 0 | 1 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| weekday_is_sunday | 0 | 1 | 0.07 | 0.25 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| is_weekend | 0 | 1 | 0.13 | 0.34 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| LDA_00 | 0 | 1 | 0.18 | 0.26 | 0.00 | 0.03 | 0.03 | 0.24 | 0.93 | ▇▁▁▁▁ |
| LDA_01 | 0 | 1 | 0.14 | 0.22 | 0.00 | 0.03 | 0.03 | 0.15 | 0.93 | ▇▁▁▁▁ |
| LDA_02 | 0 | 1 | 0.22 | 0.28 | 0.00 | 0.03 | 0.04 | 0.33 | 0.92 | ▇▁▁▁▁ |
| LDA_03 | 0 | 1 | 0.22 | 0.30 | 0.00 | 0.03 | 0.04 | 0.38 | 0.93 | ▇▁▁▁▂ |
| LDA_04 | 0 | 1 | 0.23 | 0.29 | 0.00 | 0.03 | 0.04 | 0.40 | 0.93 | ▇▂▁▁▂ |
| global_subjectivity | 0 | 1 | 0.44 | 0.12 | 0.00 | 0.40 | 0.45 | 0.51 | 1.00 | ▁▃▇▁▁ |
| global_sentiment_polarity | 0 | 1 | 0.12 | 0.10 | -0.39 | 0.06 | 0.12 | 0.18 | 0.73 | ▁▂▇▁▁ |
| global_rate_positive_words | 0 | 1 | 0.04 | 0.02 | 0.00 | 0.03 | 0.04 | 0.05 | 0.16 | ▅▇▁▁▁ |
| global_rate_negative_words | 0 | 1 | 0.02 | 0.01 | 0.00 | 0.01 | 0.02 | 0.02 | 0.18 | ▇▁▁▁▁ |
| rate_positive_words | 0 | 1 | 0.68 | 0.19 | 0.00 | 0.60 | 0.71 | 0.80 | 1.00 | ▁▁▃▇▃ |
| rate_negative_words | 0 | 1 | 0.29 | 0.16 | 0.00 | 0.19 | 0.28 | 0.38 | 1.00 | ▅▇▃▁▁ |
| avg_positive_polarity | 0 | 1 | 0.35 | 0.10 | 0.00 | 0.31 | 0.36 | 0.41 | 1.00 | ▁▇▃▁▁ |
| min_positive_polarity | 0 | 1 | 0.10 | 0.07 | 0.00 | 0.05 | 0.10 | 0.10 | 1.00 | ▇▁▁▁▁ |
| max_positive_polarity | 0 | 1 | 0.76 | 0.25 | 0.00 | 0.60 | 0.80 | 1.00 | 1.00 | ▁▁▅▅▇ |
| avg_negative_polarity | 0 | 1 | -0.26 | 0.13 | -1.00 | -0.33 | -0.25 | -0.19 | 0.00 | ▁▁▂▇▃ |
| min_negative_polarity | 0 | 1 | -0.52 | 0.29 | -1.00 | -0.70 | -0.50 | -0.30 | 0.00 | ▆▆▇▅▅ |
| max_negative_polarity | 0 | 1 | -0.11 | 0.10 | -1.00 | -0.12 | -0.10 | -0.05 | 0.00 | ▁▁▁▁▇ |
| title_subjectivity | 0 | 1 | 0.28 | 0.32 | 0.00 | 0.00 | 0.15 | 0.50 | 1.00 | ▇▂▂▁▂ |
| title_sentiment_polarity | 0 | 1 | 0.07 | 0.27 | -1.00 | 0.00 | 0.00 | 0.15 | 1.00 | ▁▁▇▂▁ |
| abs_title_subjectivity | 0 | 1 | 0.34 | 0.19 | 0.00 | 0.17 | 0.50 | 0.50 | 0.50 | ▃▂▁▁▇ |
| abs_title_sentiment_polarity | 0 | 1 | 0.16 | 0.23 | 0.00 | 0.00 | 0.00 | 0.25 | 1.00 | ▇▂▁▁▁ |
| shares | 0 | 1 | 3395.38 | 11626.95 | 1.00 | 946.00 | 1400.00 | 2800.00 | 843300.00 | ▇▁▁▁▁ |
introduce(data)
  rows columns discrete_columns continuous_columns all_missing_columns
1 39644 61 1 60 0
total_missing_values complete_rows total_observations memory_usage
1 0 39644 2418284 24069448
# Check how many unique values are in each column.
# vapply() is used instead of sapply() so the result is guaranteed to be an
# integer vector regardless of the input (sapply's return type can vary).
unique_values <- vapply(data, function(x) length(unique(x)), integer(1L))

# Show the unique counts as a one-column data frame
# (the column names of `data` become the row names).
unique_values_df <- data.frame(unique_values)
print(unique_values_df)
                              unique_values
url 39644
timedelta 724
n_tokens_title 20
n_tokens_content 2406
n_unique_tokens 27281
n_non_stop_words 1451
n_non_stop_unique_tokens 22930
num_hrefs 133
num_self_hrefs 59
num_imgs 91
num_videos 53
average_token_length 30136
num_keywords 10
data_channel_is_lifestyle 2
data_channel_is_entertainment 2
data_channel_is_bus 2
data_channel_is_socmed 2
data_channel_is_tech 2
data_channel_is_world 2
kw_min_min 26
kw_max_min 1076
kw_avg_min 17003
kw_min_max 1021
kw_max_max 35
kw_avg_max 30834
kw_min_avg 15982
kw_max_avg 19438
kw_avg_avg 39300
self_reference_min_shares 1255
self_reference_max_shares 1137
self_reference_avg_sharess 8626
weekday_is_monday 2
weekday_is_tuesday 2
weekday_is_wednesday 2
weekday_is_thursday 2
weekday_is_friday 2
weekday_is_saturday 2
weekday_is_sunday 2
is_weekend 2
LDA_00 39337
LDA_01 39098
LDA_02 39525
LDA_03 38963
LDA_04 39370
global_subjectivity 34501
global_sentiment_polarity 34695
global_rate_positive_words 13159
global_rate_negative_words 10271
rate_positive_words 2284
rate_negative_words 2284
avg_positive_polarity 27301
min_positive_polarity 33
max_positive_polarity 38
avg_negative_polarity 13841
min_negative_polarity 54
max_negative_polarity 49
title_subjectivity 673
title_sentiment_polarity 813
abs_title_subjectivity 532
abs_title_sentiment_polarity 653
shares 1454
# Convert variables to factors if they contain only 2 unique values (0 and 1).
# across(where(...)) replaces the superseded mutate_if() with identical effect.
data <- data %>%
  mutate(across(where(~ length(unique(.x)) == 2), as.factor))
Handle Outliers
# Outlier trimming ----
# Record the 1st and 99th percentile of every numeric predictor (computed on
# the FULL data set) and then drop rows falling outside any of those bounds.
# The target variable `shares` is deliberately excluded from trimming.
trim_cols <- setdiff(
  names(data)[vapply(data, is.numeric, logical(1L))],
  "shares"
)

# Build the percentile table in one shot; growing a data frame with rbind()
# inside a loop is quadratic in the number of columns.
percentile_df <- do.call(rbind, lapply(trim_cols, function(col) {
  qs <- quantile(data[[col]], probs = c(0.01, 0.99))
  data.frame(Variable = col, P1 = qs[[1]], P99 = qs[[2]],
             stringsAsFactors = FALSE)
}))

# Keep only rows inside [P1, P99] for every trimmed variable. Because all
# bounds were computed up front on the full data, one combined mask selects
# exactly the same rows as the original sequential column-by-column filter.
keep <- rep(TRUE, nrow(data))
for (i in seq_len(nrow(percentile_df))) {
  col <- percentile_df$Variable[i]
  keep <- keep &
    data[[col]] >= percentile_df$P1[i] &
    data[[col]] <= percentile_df$P99[i]
}
data <- data[keep, ]
rownames(percentile_df) <- NULL
# Summary statistics of the data as a data frame
# Per-variable summary statistics (quartiles, mean, median, range, SD) for
# every numeric column, one row per variable. Rows are built as a list and
# bound once at the end instead of rbind()-ing inside a loop (quadratic).
numeric_cols <- names(data)[vapply(data, is.numeric, logical(1L))]

summary_df <- do.call(rbind, lapply(numeric_cols, function(col) {
  x <- data[[col]]
  data.frame(
    Variable   = col,
    # na.rm = TRUE added to the quantile calls for consistency with the
    # other statistics (quantile() errors on NA without it).
    Quantile_1 = quantile(x, probs = 0.25, na.rm = TRUE),
    Mean       = mean(x, na.rm = TRUE),
    Median     = median(x, na.rm = TRUE),
    Min        = min(x, na.rm = TRUE),
    Max        = max(x, na.rm = TRUE),
    SD         = sd(x, na.rm = TRUE),
    Quantile_3 = quantile(x, probs = 0.75, na.rm = TRUE),
    stringsAsFactors = FALSE
  )
}))
rownames(summary_df) <- NULL
# Display the summary statistics of the data
print(summary_df) Variable Quantile_1 Mean Median
1 timedelta 1.630000e+02 3.531640e+02 3.380000e+02
2 n_tokens_title 9.000000e+00 1.037154e+01 1.000000e+01
3 n_tokens_content 2.610000e+02 5.228292e+02 4.180000e+02
4 n_unique_tokens 4.723772e-01 5.260999e-01 5.375723e-01
5 n_non_stop_words 1.000000e+00 9.684708e-01 1.000000e+00
6 n_non_stop_unique_tokens 6.315789e-01 6.729826e-01 6.918605e-01
7 num_hrefs 4.000000e+00 9.844100e+00 7.000000e+00
8 num_self_hrefs 1.000000e+00 3.010681e+00 2.000000e+00
9 num_imgs 1.000000e+00 3.709390e+00 1.000000e+00
10 num_videos 0.000000e+00 9.092808e-01 0.000000e+00
11 average_token_length 4.476955e+00 4.528637e+00 4.659884e+00
12 num_keywords 6.000000e+00 7.100065e+00 7.000000e+00
13 kw_min_min -1.000000e+00 2.432830e+01 -1.000000e+00
14 kw_max_min 4.450000e+02 9.002063e+02 6.530000e+02
15 kw_avg_min 1.415278e+02 2.729934e+02 2.320000e+02
16 kw_min_max 0.000000e+00 8.199812e+03 1.300000e+03
17 kw_max_max 8.433000e+05 7.611773e+05 8.433000e+05
18 kw_avg_max 1.743929e+05 2.551630e+05 2.428750e+05
19 kw_min_avg 0.000000e+00 1.062847e+03 9.890000e+02
20 kw_max_avg 3.535056e+03 5.011222e+03 4.199946e+03
21 kw_avg_avg 2.365082e+03 2.985165e+03 2.806850e+03
22 self_reference_min_shares 6.470000e+02 2.789009e+03 1.200000e+03
23 self_reference_max_shares 1.000000e+03 6.546095e+03 2.700000e+03
24 self_reference_avg_sharess 9.590000e+02 4.311845e+03 2.120000e+03
25 LDA_00 2.571757e-02 1.901247e-01 3.369594e-02
26 LDA_01 2.505856e-02 1.347287e-01 3.334509e-02
27 LDA_02 2.857354e-02 2.276418e-01 4.009573e-02
28 LDA_03 2.857161e-02 1.957457e-01 4.000011e-02
29 LDA_04 2.897078e-02 2.517590e-01 5.000347e-02
30 global_subjectivity 3.924948e-01 4.361888e-01 4.492643e-01
31 global_sentiment_polarity 6.127906e-02 1.185351e-01 1.183382e-01
32 global_rate_positive_words 2.877698e-02 3.910349e-02 3.887689e-02
33 global_rate_negative_words 9.637264e-03 1.598469e-02 1.515152e-02
34 rate_positive_words 6.065574e-01 6.861195e-01 7.142857e-01
35 rate_negative_words 1.875000e-01 2.822828e-01 2.777778e-01
36 avg_positive_polarity 3.028827e-01 3.458340e-01 3.537252e-01
37 min_positive_polarity 5.000000e-02 8.960845e-02 1.000000e-01
38 max_positive_polarity 6.000000e-01 7.493520e-01 8.000000e-01
39 avg_negative_polarity -3.183918e-01 -2.501789e-01 -2.492857e-01
40 min_negative_polarity -7.000000e-01 -5.096009e-01 -5.000000e-01
41 max_negative_polarity -1.250000e-01 -1.016809e-01 -1.000000e-01
42 title_subjectivity 0.000000e+00 2.661779e-01 1.000000e-01
43 title_sentiment_polarity 0.000000e+00 7.581188e-02 0.000000e+00
44 abs_title_subjectivity 1.666667e-01 3.436643e-01 5.000000e-01
45 abs_title_sentiment_polarity 0.000000e+00 1.416363e-01 0.000000e+00
46 shares 9.420000e+02 3.164150e+03 1.400000e+03
Min Max SD Quantile_3
1 1.700000e+01 7.240000e+02 2.112737e+02 5.410000e+02
2 6.000000e+00 1.500000e+01 1.994933e+00 1.200000e+01
3 0.000000e+00 2.253000e+03 3.721950e+02 6.970000e+02
4 0.000000e+00 8.023256e-01 1.306856e-01 6.016540e-01
5 0.000000e+00 1.000000e+00 1.747460e-01 1.000000e+00
6 0.000000e+00 9.206349e-01 1.500215e-01 7.520661e-01
7 0.000000e+00 5.600000e+01 8.315608e+00 1.300000e+01
8 0.000000e+00 2.000000e+01 2.672329e+00 4.000000e+00
9 0.000000e+00 3.700000e+01 5.941101e+00 3.000000e+00
10 0.000000e+00 2.100000e+01 2.619062e+00 1.000000e+00
11 0.000000e+00 5.443532e+00 8.563799e-01 4.844262e+00
12 3.000000e+00 1.000000e+01 1.819730e+00 8.000000e+00
13 -1.000000e+00 2.170000e+02 6.744013e+01 4.000000e+00
14 0.000000e+00 1.040000e+04 9.547208e+02 1.000000e+03
15 -1.000000e+00 1.700833e+03 2.032824e+02 3.512361e+02
16 0.000000e+00 2.083000e+05 1.940463e+04 7.000000e+03
17 3.740000e+04 8.433000e+05 1.982135e+05 8.433000e+05
18 1.306000e+04 6.358465e+05 1.224144e+05 3.254938e+05
19 0.000000e+00 3.511757e+03 1.089737e+03 1.975000e+03
20 2.561218e+03 2.380000e+04 2.252194e+03 5.773119e+03
21 1.340440e+03 6.991623e+03 8.882966e+02 3.427088e+03
22 0.000000e+00 5.310000e+04 5.365925e+03 2.500000e+03
23 0.000000e+00 9.870000e+04 1.124966e+04 7.000000e+03
24 0.000000e+00 6.660000e+04 6.684641e+03 4.800000e+03
25 2.000055e-02 8.999490e-01 2.614731e-01 2.700108e-01
26 2.000083e-02 8.855888e-01 2.086045e-01 1.461843e-01
27 2.000045e-02 9.109854e-01 2.834169e-01 3.683428e-01
28 2.000028e-02 9.108017e-01 2.726876e-01 2.722143e-01
29 2.000050e-02 9.194993e-01 2.926953e-01 4.492615e-01
30 0.000000e+00 6.850000e-01 1.119651e-01 5.011580e-01
31 -1.119835e-01 3.729592e-01 8.318794e-02 1.738709e-01
32 0.000000e+00 8.514851e-02 1.600481e-02 4.950495e-02
33 0.000000e+00 5.232558e-02 9.344544e-03 2.131881e-02
34 0.000000e+00 1.000000e+00 1.826755e-01 8.000000e-01
35 0.000000e+00 7.058824e-01 1.435604e-01 3.750000e-01
36 0.000000e+00 6.000000e-01 9.703091e-02 4.030303e-01
37 0.000000e+00 4.000000e-01 5.510139e-02 1.000000e-01
38 0.000000e+00 1.000000e+00 2.455111e-01 1.000000e+00
39 -6.125000e-01 0.000000e+00 1.131494e-01 -1.833333e-01
40 -1.000000e+00 0.000000e+00 2.830645e-01 -3.000000e-01
41 -5.000000e-01 0.000000e+00 7.430980e-02 -5.000000e-02
42 0.000000e+00 1.000000e+00 3.144359e-01 5.000000e-01
43 -7.000000e-01 1.000000e+00 2.404664e-01 1.363636e-01
44 0.000000e+00 5.000000e-01 1.882429e-01 5.000000e-01
45 0.000000e+00 1.000000e+00 2.085910e-01 2.181818e-01
46 5.000000e+00 8.433000e+05 1.017555e+04 2.700000e+03
glimpse(data)
Rows: 29,211
Columns: 61
$ url <chr> "http://mashable.com/2013/01/14/aaron-sw…
$ timedelta <dbl> 724, 724, 724, 724, 724, 724, 724, 724, …
$ n_tokens_title <dbl> 11, 9, 10, 12, 11, 13, 8, 14, 8, 9, 12, …
$ n_tokens_content <dbl> 501, 282, 324, 230, 419, 197, 1046, 207,…
$ n_unique_tokens <dbl> 0.5454545, 0.6214286, 0.5279503, 0.62100…
$ n_non_stop_words <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ n_non_stop_unique_tokens <dbl> 0.7716263, 0.7714286, 0.7569061, 0.76562…
$ num_hrefs <dbl> 6, 6, 6, 7, 7, 4, 9, 4, 8, 6, 5, 22, 19,…
$ num_self_hrefs <dbl> 1, 3, 2, 1, 1, 3, 4, 1, 0, 2, 2, 4, 0, 2…
$ num_imgs <dbl> 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1…
$ num_videos <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1…
$ average_token_length <dbl> 4.758483, 4.570922, 4.919753, 4.673913, …
$ num_keywords <dbl> 4, 7, 6, 5, 8, 5, 7, 9, 5, 8, 4, 10, 7, …
$ data_channel_is_lifestyle <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
$ data_channel_is_entertainment <fct> 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1…
$ data_channel_is_bus <fct> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
$ data_channel_is_socmed <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ data_channel_is_tech <fct> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
$ data_channel_is_world <fct> 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0…
$ kw_min_min <dbl> 217, 217, 217, 217, 217, 217, 217, 217, …
$ kw_max_min <dbl> 5300, 5300, 5300, 1300, 593, 1100, 582, …
$ kw_avg_min <dbl> 1549.5000, 1134.6667, 1549.5000, 544.400…
$ kw_min_max <dbl> 7900, 0, 0, 2800, 0, 1300, 0, 0, 0, 0, 0…
$ kw_max_max <dbl> 37400, 37400, 37400, 37400, 37400, 37400…
$ kw_avg_max <dbl> 23925.00, 21271.43, 15950.00, 13480.00, …
$ kw_min_avg <dbl> 2381.743, 0.000, 0.000, 1465.478, 0.000,…
$ kw_max_avg <dbl> 6600.000, 6600.000, 6600.000, 2714.088, …
$ kw_avg_avg <dbl> 3579.694, 3024.592, 2386.463, 1953.027, …
$ self_reference_min_shares <dbl> 7900, 7900, 1400, 1900, 8600, 485, 1600,…
$ self_reference_max_shares <dbl> 7900, 7900, 5300, 1900, 8600, 1100, 1600…
$ self_reference_avg_sharess <dbl> 7900.0, 7900.0, 3350.0, 1900.0, 8600.0, …
$ weekday_is_monday <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ weekday_is_tuesday <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ weekday_is_wednesday <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ weekday_is_thursday <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ weekday_is_friday <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ weekday_is_saturday <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ weekday_is_sunday <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ is_weekend <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ LDA_00 <dbl> 0.05000033, 0.02895747, 0.03333421, 0.25…
$ LDA_01 <dbl> 0.05000934, 0.02917208, 0.03333801, 0.04…
$ LDA_02 <dbl> 0.79998636, 0.88295760, 0.86526361, 0.04…
$ LDA_03 <dbl> 0.05000038, 0.03031500, 0.03333397, 0.04…
$ LDA_04 <dbl> 0.05000359, 0.02859785, 0.03473021, 0.62…
$ global_subjectivity <dbl> 0.5284884, 0.3780702, 0.3623612, 0.37175…
$ global_sentiment_polarity <dbl> 0.01492248, 0.17807018, 0.07901374, 0.12…
$ global_rate_positive_words <dbl> 0.02594810, 0.03900709, 0.03395062, 0.03…
$ global_rate_negative_words <dbl> 0.035928144, 0.007092199, 0.018518519, 0…
$ rate_positive_words <dbl> 0.4193548, 0.8461538, 0.6470588, 0.66666…
$ rate_negative_words <dbl> 0.58064516, 0.15384615, 0.35294118, 0.33…
$ avg_positive_polarity <dbl> 0.4519231, 0.3803030, 0.3524833, 0.40312…
$ min_positive_polarity <dbl> 0.03333333, 0.05000000, 0.13636364, 0.10…
$ max_positive_polarity <dbl> 1.0, 1.0, 0.7, 0.9, 0.7, 1.0, 1.0, 0.9, …
$ avg_negative_polarity <dbl> -0.2481481, -0.4333333, -0.4000000, -0.2…
$ min_negative_polarity <dbl> -0.5000000, -0.7000000, -0.7500000, -0.6…
$ max_negative_polarity <dbl> -0.10000000, -0.16666667, -0.10000000, -…
$ title_subjectivity <dbl> 0.0000000, 0.0000000, 0.0000000, 0.62500…
$ title_sentiment_polarity <dbl> 0.00000000, 0.00000000, 0.00000000, -0.3…
$ abs_title_subjectivity <dbl> 0.500000000, 0.500000000, 0.500000000, 0…
$ abs_title_sentiment_polarity <dbl> 0.00000000, 0.00000000, 0.00000000, 0.37…
$ shares <int> 630, 2900, 2000, 909, 1900, 432, 3000, 5…
# Convert shares to a categorical variable with 3 levels
# thresh_1 = 1000
# thresh_2 = 3000
# data$Popularity <- ifelse(data$shares < thresh_1, 'Unpopular', ifelse(data$shares < thresh_2, 'Regular', 'Popular'))
# Convert expensive to a factor variable
# data$Popularity <- as.factor(data$Popularity)
# Convert shares to a categorical variable with 2 levels
thresh <- 5000  # articles with at least 5000 shares are labelled 'Popular'
data$Popularity <- ifelse(data$shares >= thresh, 'Popular', 'Unpopular')
# Convert Popularity to a factor variable (levels sort alphabetically: Popular, Unpopular)
data$Popularity <- as.factor(data$Popularity)
# Scale the data
# Collect the names of the numeric columns; only these are min-max scaled.
# select_if() is superseded in dplyr; select(where(...)) is the current idiom.
numeric_vars <- data %>% select(where(is.numeric)) %>% names()
numeric_cols <- data[, numeric_vars]
# Normalize the data
# Min-max scale a numeric vector to [0, 1].
#
# BUG FIX: the original computed (x - min) / (max - min) unconditionally, so a
# constant column produced 0/0 = NaN for every row; constant input now maps to 0.
# na.rm (default TRUE, backward compatible for the NA-free data used here) lets
# vectors containing NA be scaled instead of collapsing to all-NA.
normalize <- function(x, na.rm = TRUE) {
  rng <- range(x, na.rm = na.rm)
  if (rng[1] == rng[2]) {
    # zero-range (constant) input: no spread to scale by
    return(rep(0, length(x)))
  }
  (x - rng[1]) / (rng[2] - rng[1])
}
# Min-max scale each numeric column. lapply keeps columns as vectors;
# apply() would first coerce the whole data frame to a matrix.
scaled_data <- as.data.frame(lapply(numeric_cols, normalize))
# Update the original data frame with the scaled data
data[, names(scaled_data)] <- scaled_dataVisualize the distribution of the data
# Plot distribution of all categorical variables
plot_bar(data)introduce(data) rows columns discrete_columns continuous_columns all_missing_columns
1 29211 62 16 46 0
total_missing_values complete_rows total_observations memory_usage
1 0 29211 1811082 16348376
plot_intro(data)plot_density(data)Drop constant numeric and character variables
# Identify numeric and factor variables
numeric_cols <- sapply(data, is.numeric)
factor_cols <- sapply(data, is.factor)
# Calculate variance for numeric variables
variance <- apply(data[, numeric_cols], 2, var)
# BUG FIX: variance is never negative, so the original `variance >= 0` filter
# kept every column and dropped nothing. Use a strict `> 0` so constant
# (zero-variance) numeric variables are actually removed, as the section
# heading intends.
selected_numeric_cols <- names(variance[variance > 0])
# Combine selected numeric and factor variables
selected_cols <- c(selected_numeric_cols, names(data)[factor_cols])
# Subset the dataset with selected columns
data_filtered <- data[selected_cols]
# Check for correlation between numeric variables
# Check for correlation between numeric variables.
# select_if() is superseded in dplyr; select(where(...)) is the current idiom.
numeric_vars <- data_filtered %>% select(where(is.numeric)) %>% names()
numeric_cols <- data_filtered[, numeric_vars]
# Calculate the correlation matrix with numerical variables
correlation_matrix <- abs(cor(numeric_cols)) # Absolute value of the correlation matrix
# Plot the correlation matrix
corrplot(correlation_matrix,
         method = "circle",
         number.cex = 0.2,
         tl.srt = 90,
         tl.cex = 0.4,
         order = "hclust",
         type = "upper",
         tl.col = "black")
# Find variables with |r| > 0.8 (caret::findCorrelation returns column indices)
highly_correlated <- findCorrelation(correlation_matrix, cutoff = 0.8)
highly_correlated_vars <- names(numeric_cols)[highly_correlated]
# Remove highly correlated variables
clean_df <- data_filtered[, !names(data_filtered) %in% highly_correlated_vars]
# Display the structure of the clean dataset
head(clean_df) timedelta n_tokens_title n_tokens_content n_unique_tokens num_hrefs
357 1 0.5555556 0.22237017 0.6798419 0.10714286
358 1 0.3333333 0.12516644 0.7745342 0.10714286
360 1 0.4444444 0.14380826 0.6580250 0.10714286
362 1 0.6666667 0.10208611 0.7740057 0.12500000
367 1 0.5555556 0.18597426 0.7135278 0.12500000
369 1 0.7777778 0.08743897 0.9110572 0.07142857
num_self_hrefs num_imgs num_videos average_token_length num_keywords
357 0.05 0.02702703 0.00000000 0.8741536 0.1428571
358 0.15 0.00000000 0.04761905 0.8396979 0.5714286
360 0.10 0.02702703 0.00000000 0.9037796 0.4285714
362 0.05 0.02702703 0.00000000 0.8586177 0.2857143
367 0.05 0.02702703 0.00000000 0.8694166 0.7142857
369 0.15 0.00000000 0.04761905 0.8345955 0.2857143
kw_min_min kw_avg_min kw_min_max kw_avg_max kw_min_avg kw_max_avg
357 1 0.9110763 0.037926068 0.0174457860 0.6782197 0.19016072
358 1 0.6673196 0.000000000 0.0131849816 0.0000000 0.19016072
360 1 0.9110763 0.000000000 0.0046404346 0.0000000 0.19016072
362 1 0.3204779 0.013442151 0.0006743884 0.4173063 0.00719767
367 1 0.2196161 0.000000000 0.0022720467 0.0000000 0.09902242
369 1 0.2993243 0.006240999 0.0134235408 0.3417093 0.05323524
kw_avg_avg self_reference_min_shares self_reference_avg_sharess LDA_00
357 0.39624526 0.14877589 0.11861862 0.03409265
358 0.29801767 0.14877589 0.11861862 0.01017891
360 0.18509803 0.02636535 0.05030030 0.01515276
362 0.10839972 0.03578154 0.02852853 0.26762898
367 0.03670889 0.16195857 0.12912913 0.54349834
369 0.19787391 0.00913371 0.01189940 0.02301291
LDA_01 LDA_02 LDA_03 LDA_04 global_subjectivity
357 0.034668350 0.87541986 0.033677650 0.033355335 0.7715159
358 0.010595403 0.96854291 0.011579143 0.009557931 0.5519273
360 0.015408226 0.94868401 0.014968193 0.016375456 0.5289944
362 0.023105392 0.02246754 0.022451440 0.672018575 0.5427142
367 0.005780248 0.45649340 0.005615495 0.005572540 0.5484982
369 0.690673205 0.26966710 0.024273850 0.022252694 0.5610853
global_sentiment_polarity global_rate_positive_words
357 0.2616927 0.3047394
358 0.5981195 0.4581065
360 0.3938553 0.3987224
362 0.4867741 0.4084934
367 0.4475931 0.3924072
369 0.6420019 0.7749970
global_rate_negative_words rate_positive_words avg_positive_polarity
357 0.6866267 0.4193548 0.7532051
358 0.1355398 0.8461538 0.6338384
360 0.3539095 0.6470588 0.5874721
362 0.3323671 0.6666667 0.6718750
367 0.2736675 0.7000000 0.5964286
369 0.1940214 0.8666667 0.5879953
min_positive_polarity max_positive_polarity avg_negative_polarity
357 0.08333333 1.0 0.5948602
358 0.12500000 1.0 0.2925170
360 0.34090909 0.7 0.3469388
362 0.25000000 0.9 0.5952381
367 0.25000000 0.7 0.5816327
369 0.25000000 1.0 0.5714286
min_negative_polarity max_negative_polarity title_subjectivity
357 0.50 0.8000000 0.000
358 0.30 0.6666667 0.000
360 0.25 0.8000000 0.000
362 0.40 0.8000000 0.625
367 0.50 0.8000000 0.200
369 0.60 0.7500000 0.300
title_sentiment_polarity abs_title_subjectivity
357 0.4117647 1.00
358 0.4117647 1.00
360 0.4117647 1.00
362 0.1911765 0.25
367 0.4852941 0.60
369 1.0000000 0.40
abs_title_sentiment_polarity shares data_channel_is_lifestyle
357 0.000 0.0007411404 0
358 0.000 0.0034329624 0
360 0.000 0.0023657202 0
362 0.375 0.0010719855 0
367 0.125 0.0022471377 0
369 1.000 0.0005063471 0
data_channel_is_entertainment data_channel_is_bus data_channel_is_socmed
357 0 0 0
358 0 0 0
360 0 0 0
362 0 1 0
367 0 0 0
369 1 0 0
data_channel_is_tech data_channel_is_world weekday_is_monday
357 0 1 1
358 0 1 1
360 0 1 1
362 0 0 1
367 0 1 1
369 0 0 1
weekday_is_tuesday weekday_is_wednesday weekday_is_thursday
357 0 0 0
358 0 0 0
360 0 0 0
362 0 0 0
367 0 0 0
369 0 0 0
weekday_is_friday weekday_is_saturday weekday_is_sunday is_weekend
357 0 0 0 0
358 0 0 0 0
360 0 0 0 0
362 0 0 0 0
367 0 0 0 0
369 0 0 0 0
Popularity
357 Unpopular
358 Unpopular
360 Unpopular
362 Unpopular
367 Unpopular
369 Unpopular
# View the number of levels of each of the factor variables in the data
factor_cols <- sapply(clean_df, is.factor)
factor_col_names <- names(factor_cols[factor_cols])
# FIX (consistency): count levels on clean_df — the dataset used downstream —
# not on the original `data`. vapply pins the return type, unlike sapply.
levels_count <- vapply(clean_df[, factor_col_names], function(x) length(levels(x)), integer(1))
levels_count_df <- data.frame(levels_count)
print(levels_count_df) levels_count
data_channel_is_lifestyle 2
data_channel_is_entertainment 2
data_channel_is_bus 2
data_channel_is_socmed 2
data_channel_is_tech 2
data_channel_is_world 2
weekday_is_monday 2
weekday_is_tuesday 2
weekday_is_wednesday 2
weekday_is_thursday 2
weekday_is_friday 2
weekday_is_saturday 2
weekday_is_sunday 2
is_weekend 2
Popularity 2
# Check for missing values in the data
any(is.na(clean_df))[1] FALSE
Distribution Tables and Visualizations
# Show a popularity distribution table
# (counts of Popular vs Unpopular articles in the cleaned dataset)
popularity_table <- table(clean_df$Popularity)
popularity_table
Popular Unpopular
3544 25667
# Plot the popularity distribution
# Plot the popularity distribution
barplot(popularity_table, col = "lightblue", main = "Popularity Distribution", xlab = "Popularity", ylab = "Frequency")
# Build the per-channel Yes/No count table.
channels <- c("data_channel_is_lifestyle", "data_channel_is_entertainment", "data_channel_is_bus",
              "data_channel_is_socmed", "data_channel_is_tech", "data_channel_is_world")
channel_type <- c("Lifestyle", "Entertainment", "Business", "Social Media", "Technology", "World")
# Assemble all rows at once instead of growing a data frame with rbind()
# inside a loop (O(n^2) copies); seq_along replaces the 1:length() idiom.
Channel_distr <- do.call(rbind, lapply(seq_along(channels), function(i) {
  data.frame(Channel = channel_type[i],
             Yes = sum(clean_df[[channels[i]]] == 1),
             No = sum(clean_df[[channels[i]]] == 0),
             stringsAsFactors = FALSE)
}))
rownames(Channel_distr) <- NULL
# Display the channel distribution table
Channel_distr Channel Yes No
1 Lifestyle 1627 27584
2 Entertainment 4971 24240
3 Business 4820 24391
4 Social Media 1691 27520
5 Technology 5960 23251
6 World 6682 22529
# plot the channel distribution table
# gather() is superseded in tidyr; pivot_longer() is the current reshaping verb.
pivot_longer(Channel_distr, cols = -Channel, names_to = "Response", values_to = "Count") %>%
  ggplot(aes(x = Channel, y = Count, fill = Response)) +
  geom_bar(stat = "identity", position = 'dodge') +
  labs(title = "Channel Distribution", x = "Channel", y = "Count") +
  scale_fill_manual(values = c("Yes" = "darkgreen", "No" = "maroon")) + # Specify colors for "Yes" and "No" bars
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# Create a dataframe to store the day distribution
# Build the per-day Yes/No count table (one row per weekday indicator).
days <- c("weekday_is_monday", "weekday_is_tuesday", "weekday_is_wednesday",
          "weekday_is_thursday", "weekday_is_friday", "weekday_is_saturday", "weekday_is_sunday", "is_weekend")
day_name <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", "Weekend")
# Assemble all rows at once instead of growing a data frame with rbind()
# inside a loop; seq_along replaces the 1:length() idiom.
Day_distr <- do.call(rbind, lapply(seq_along(days), function(i) {
  data.frame(Day = day_name[i],
             Yes = sum(clean_df[[days[i]]] == 1),
             No = sum(clean_df[[days[i]]] == 0),
             stringsAsFactors = FALSE)
}))
rownames(Day_distr) <- NULL
# Display the day distribution table
Day_distr Day Yes No
1 Monday 4929 24282
2 Tuesday 5525 23686
3 Wednesday 5548 23663
4 Thursday 5372 23839
5 Friday 4175 25036
6 Saturday 1811 27400
7 Sunday 1851 27360
8 Weekend 3662 25549
# plot the day distribution table
# gather() is superseded in tidyr; pivot_longer() is the current reshaping verb.
pivot_longer(Day_distr, cols = -Day, names_to = "Response", values_to = "Count") %>%
  ggplot(aes(x = Day, y = Count, fill = Response)) +
  geom_bar(stat = "identity", position = 'dodge') +
  labs(title = "Day Distribution", x = "Day", y = "Count") +
  scale_fill_manual(values = c("Yes" = "darkgreen", "No" = "maroon")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# plot relationship between variables
# Scatterplots of selected numeric features (statements re-separated; the
# rendered source had them fused together).
ggplot(clean_df, aes(x = shares, y = global_subjectivity)) +
  geom_point(color = "purple", alpha = 0.5) +
  labs(title = "Scatterplot of shares Vs global_subjectivity",
       y = "Global Subjectivity",
       x = "Shares")
# BUG FIX: this plot maps n_tokens_content on x, but its title and x label
# claimed "Shares"; the labels now describe the variable actually plotted.
ggplot(clean_df, aes(x = n_tokens_content, y = min_negative_polarity)) +
  geom_point(color = "pink", alpha = 0.5) +
  labs(title = "Scatterplot of Content Length Vs Min Negative Polarity",
       y = "Min Negative Polarity",
       x = "Number of Tokens in Content")
ggplot(clean_df, aes(x = shares, y = rate_positive_words)) +
  geom_point(alpha = 0.5) +
  labs(title = "Scatterplot of Shares Vs Positive Word Rate",
       y = "Positive Word Rate",
       x = "Shares")
ggplot(clean_df, aes(x = shares, y = num_videos)) +
  geom_point(alpha = 0.5) +
  labs(title = "Scatterplot of Shares Vs Number of Videos",
       y = "Number of Videos",
       x = "Shares")
ggplot(clean_df, aes(x = shares, y = num_imgs)) +
  geom_point(alpha = 0.5) +
  labs(title = "Scatterplot of Shares Vs Number of Images",
       y = "Number of Images",
       x = "Shares")
# Proportion of Popular vs Unpopular articles by publication indicator.
ggplot(clean_df, aes(x = weekday_is_friday, fill = Popularity)) +
  geom_bar(position = "fill") +
  labs(y = "Article Popularity",
       x = "Friday",
       title = "Popularity of Articles published on Friday")
ggplot(clean_df, aes(x = is_weekend, fill = Popularity)) +
  geom_bar(position = "fill") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(y = "Article Popularity",
       x = "Weekend",
       title = "Popularity of Articles published on the Weekend")
ggplot(clean_df, aes(x = data_channel_is_socmed, fill = Popularity)) +
  geom_bar(position = "fill") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(x = "Social Media Channel",
       y = "Article Popularity",
       title = "Popularity of Articles on Social Media")
ggplot(clean_df, aes(x = data_channel_is_lifestyle, fill = Popularity)) +
  geom_bar(position = "fill") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(x = "Lifestyle Channel",
       y = "Article Popularity",
       title = "Popularity of Articles on Lifestyle Channel")
# Define regression and classification datasets
reg_df <- clean_df %>% select(-Popularity)
class_df <- clean_df %>% select(-shares)2 Model Development - Regression
2.1 Model Selection
The models selected are as follows:
2.1.1 Random Forest (RF)
RF is an ensemble learning method that builds multiple decision trees and averages their predictions to improve accuracy and robustness.
- Advantages
Robustness:RF is less prone to overfitting compared to many other algorithms because it builds multiple decision trees and averages their predictions.Handles non-linear relationships well:RF can capture complex interactions and non-linear relationships between features and the target variable.- Feature importance: RF provides a measure of feature importance, which can help identify the key factors driving article popularity.
Handles categorical features naturally:RF can handle categorical features without the need for one-hot encoding.
- Disadvantages
Computationally expensive:Training multiple decision trees can be time-consuming and resource-intensive, especially with large datasets.Lack of interpretability:While RF provides feature importance, the individual trees’ predictions are difficult to interpret compared to simpler models like linear regression.
2.1.2 Gradient Boosting Machine (GBM)
GBM is another ensemble learning method that builds decision trees sequentially, with each tree correcting the errors of the previous trees.
- Advantages
High predictive accuracy:GBM sequentially builds trees, each one correcting errors of the previous trees, leading to high predictive accuracy.Handles missing data:GBM can handle missing data well by using surrogate splits.Feature importance:Like RF, GBM provides feature importance, aiding in understanding which features drive article popularity.Robustness to outliers:GBM’s robustness to outliers can be advantageous in datasets where extreme values may exist.
- Disadvantages
Potential overfitting:GBM can overfit if not properly tuned, especially with deep trees or insufficient regularization.Computationally expensive:Similar to RF, GBM training can be computationally expensive, especially with large datasets and complex models.Hyperparameter tuning:GBM requires careful tuning of hyperparameters such as learning rate, tree depth, and regularization parameters, which can be time-consuming.
2.1.3 Extreme Gradient Boosting (XGBoost)
XGBoost is an optimized implementation of GBM that offers improved performance and efficiency. It uses a more regularized model formalization to control overfitting and parallel processing to speed up training.
- Advantages
Computational efficiency:XGBoost is optimized for speed and efficiency, making it faster than traditional GBM implementations.Regularization:XGBoost includes regularization techniques like L1 and L2 regularization to prevent overfitting.Parallel processing:XGBoost can leverage parallel processing capabilities, leading to faster training times.Flexibility:XGBoost supports various objective functions and evaluation metrics, allowing customization for different regression tasks.
- Disadvantages
Tuning complexity:While XGBoost provides default parameters, fine-tuning them for optimal performance can be complex.Black-box nature:Like GBM, XGBoost models can be challenging to interpret due to their ensemble nature and complex interactions between features.Sensitivity to hyperparameters:While XGBoost is less sensitive to some hyperparameters compared to GBM, it still requires careful tuning for optimal performance.
2.1.4 Linear Regression (Benchmark Model)
Linear regression is a simple and interpretable model that assumes a linear relationship between features and the target variable. It serves as a benchmark model for comparison with more complex algorithms because of its simplicity and ease of interpretation.
- Advantages
Simplicity and interpretability:Linear regression provides a straightforward interpretation of the relationship between each feature and the target variable.Fast training:Linear regression typically trains quickly, even on large datasets.Less prone to overfitting:Linear regression’s simplicity makes it less prone to overfitting compared to more complex models.
- Disadvantages
Limited flexibility:Linear regression assumes a linear relationship between features and the target variable, which may not capture complex interactions.Limited performance with non-linear data:Linear regression may underperform when the relationship between features and the target variable is non-linear.Vulnerability to outliers:Linear regression can be sensitive to outliers, which can skew the model’s predictions.
2.2 Model Training
Here we will train the selected models on the regression dataset and evaluate their performance using the Root Mean Squared Percentage Error (RMSPE) metric.
2.2.1 Benchmark Model - Linear Regression
set.seed(100)
# Use the dataset with the target variable as a number
n <- nrow(reg_df)
# Bootstrap the train/test split 100 times and record the out-of-bag test RMSE
# each time. Preallocated instead of grown inside the loop.
# (Name kept for compatibility, but note the metric computed is a plain RMSE,
# not a percentage error.)
RMSPE_lpm <- numeric(100)
for (j in 1:100) {
  set.seed(j)
  # Bootstrap sample forms the training set; the untouched rows form the test set
  spl_reg <- unique(sample(n, n, replace = TRUE))
  mdata_reg <- reg_df[spl_reg, ]
  test_reg <- reg_df[-spl_reg, ]
  model_lpm <- lm(shares ~ ., data = mdata_reg)
  phat_lpm <- predict(model_lpm, test_reg)
  RMSPE_lpm[j] <- sqrt(mean((test_reg$shares - phat_lpm)^2))
}
cat("Test RMSPE for LPM: ", mean(RMSPE_lpm), "\n")Test RMSPE for LPM: 0.01172372
cat("95% CI for LPM is between: ", quantile(RMSPE_lpm, c(0.025)), "to", quantile(RMSPE_lpm, c(0.975)), "\n")95% CI for LPM is between: 0.007966541 to 0.01610216
plot(RMSPE_lpm, pch = 19, col = "blue", xlab = "Number of loops", ylab = "RMSPE", main = "LPM RMSPE with 95% CI")
abline(h = mean(RMSPE_lpm), lwd = 2, lty = 2, col = "red")
abline(h = quantile(RMSPE_lpm, 0.025), lwd = 2, lty = 2, col = "green")
abline(h = quantile(RMSPE_lpm, 0.975), lwd = 2, lty = 2, col = "green")2.2.2 Random Forest Model
Out Of Bag(OOB) Error
# Fit a 1000-tree regression forest on a 25% subsample of the data;
# reg_RF is inspected below for its out-of-bag error curve.
set.seed(100)
n <- nrow(reg_df)
# Select 25% of the data as a small sample (keeps the fit tractable)
sample_size <- 0.25
sample_idx <- sample(1:n, n * sample_size, replace = FALSE)
sample_data <- reg_df[sample_idx, ]
reg_RF <- randomForest(
  shares ~ .,
  data = sample_data,
  ntree = 1000,
  importance = TRUE,
  localImp = TRUE
)
# Print the model
reg_RF
Call:
randomForest(formula = shares ~ ., data = sample_data, ntree = 1000, importance = TRUE, localImp = TRUE)
Type of random forest: regression
Number of trees: 1000
No. of variables tried at each split: 17
Mean of squared residuals: 0.0001197699
% Var explained: -2.69
# OOB error
head(reg_RF$mse)[1] 0.0006196374 0.0004885495 0.0004044670 0.0003034350 0.0002346683
[6] 0.0002182462
tail(reg_RF$mse)[1] 0.0001197815 0.0001197744 0.0001197669 0.0001197538 0.0001197619
[6] 0.0001197699
# RMSE
# NOTE(review): predicting on reg_df re-uses the rows the forest was trained
# on, so RMSE_RF is optimistic; the OOB estimate below is the honest one.
rmse_pred <- predict(reg_RF, newdata = reg_df)
RMSE_RF <- sqrt(mean((reg_df$shares - rmse_pred)^2))
# BUG FIX: randomForest's $mse component holds the OOB mean *squared* error,
# but it was reported as an RMSE without taking the square root (0.00012
# printed vs ~0.012 for every other model's RMSE).
OOB_RMSE_RF <- sqrt(reg_RF$mse[length(reg_RF$mse)])
# Print the RMSE
cat(" ", "\n")cat("OOB RMSE for Random Forest: ", OOB_RMSE_RF, "\n")OOB RMSE for Random Forest: 0.0001197699
# Plot the OOB error
plot(reg_RF,
type = "l",
col = "purple",
main = "Random Forest Model",
lwd = 2)Test RMSPE with 95% confidence interval
# Bootstrap the RF test RMSE 100 times in parallel.
num_cores <- detectCores() - 1
cl <- makeCluster(num_cores)
registerDoParallel(cl)
# NOTE: kept for compatibility, but this list stays empty — assignments made
# inside a %dopar% body run on the workers and never propagate back to the
# master session, so the original `rf_reg[[i]] <- rf_test` was a no-op here
# and has been removed. (pROC was also dropped from .packages: it was unused.)
rf_reg <- list()
test_RF_RMSE <- foreach(i = 1:100, .combine = c, .packages = c('randomForest')) %dopar% {
  set.seed(i)
  # Fresh 25% subsample each iteration, then a bootstrap train/test split
  samp_data <- reg_df[sample(nrow(reg_df), nrow(reg_df)*0.25, replace = FALSE), ]
  nr <- nrow(samp_data)
  train_idx_reg <- unique(sample(nr, nr, replace = TRUE))
  test_idx_reg <- setdiff(1:nr, train_idx_reg)
  train_data <- samp_data[train_idx_reg, ]
  test_data <- samp_data[test_idx_reg, ]
  rf_test <- randomForest(shares ~ .,
                          data = train_data,
                          ntree = 1000,
                          importance = TRUE,
                          localImp = TRUE)
  p_test <- predict(rf_test, test_data)
  # Return value of the %dopar% body: the test RMSE for this iteration
  sqrt(mean((test_data$shares - p_test)^2))
}
stopCluster(cl)
# Calculate the mean and 95% CI
cat("Test mean RMSPE:", mean(test_RF_RMSE), "\n")Test mean RMSPE: 0.01199064
cat("95% CI for RMSPE is between:", quantile(test_RF_RMSE, 0.025), "and", quantile(test_RF_RMSE, 0.975), "\n")95% CI for RMSPE is between: 0.006850634 and 0.02236891
# Plot the test RMSPEs
plot(test_RF_RMSE, pch = 19, col = "skyblue", xlab = "Number of iterations", ylab = "RMSPE", main = "RMSPE on Test Set")
abline(h = mean(test_RF_RMSE), lwd = 2, lty = 2, col = "red")
abline(h = quantile(test_RF_RMSE, 0.025), lwd = 2, lty = 2, col = "green")
abline(h = quantile(test_RF_RMSE, 0.975), lwd = 2, lty = 2, col = "green")2.2.3 Gradient Boosting(GBM)
First we create a hyperparameter grid
# Hyper-parameter grid for GBM tuning: 4 shrinkage values x 4 interaction
# depths = 16 combinations (kept small to save time). min_RMSE and
# optimal_trees start as NA and are filled in during the search below.
grid <- expand.grid(
  shrinkage = c(0.01, 0.02, 0.05, 0.1),
  interaction.depth = c(1, 3, 5, 7),
  min_RMSE = NA,
  optimal_trees = NA
)
# total number of combinations
nrow(grid)[1] 16
Next, we find the best hyperparameters…
library(gbm)
# Find the best hyperparameters
set.seed(100)
n <- nrow(reg_df)
# 80/20 test/train split
ind_best <- sample(n, n * 0.8)
share_model <- reg_df[ind_best, ]
share_test <- reg_df[-ind_best, ]
# Fit one 10-fold-CV GBM per grid row and record its best CV error.
# seq_len() replaces the 1:nrow() idiom.
for (i in seq_len(nrow(grid))) {
  model_g <- gbm(shares ~ .,
                 data = share_model,
                 distribution = "gaussian",
                 n.trees = 1500,
                 interaction.depth = grid$interaction.depth[i],
                 shrinkage = grid$shrinkage[i],
                 cv.folds = 10,
                 n.cores = 12)
  # NOTE(review): for a gaussian GBM, cv.error is the CV deviance (squared
  # error), so the column name min_RMSE is a misnomer — the rankings are the
  # same, but the values are not RMSEs. Name kept for compatibility.
  grid$min_RMSE[i] <- min(model_g$cv.error)
  grid$optimal_trees[i] <- which.min(model_g$cv.error)
}
# reporting the best parameters
best_index <- which.min(grid$min_RMSE)
best_params <- grid[best_index, ]
best_params shrinkage interaction.depth min_RMSE optimal_trees
4 0.1 1 0.0001280923 155
Test RMSE with 95% confidence interval
# now feed the best parameters into the model using train and test sets
n <- nrow(reg_df)
# Preallocated instead of grown inside the loop
RMSE_gbm <- numeric(100)
gbm_reg <- list()
for (i in 1:100) {
  set.seed(i)
  ind <- unique(sample(n, n, replace = TRUE)) # bootstrap train/test split
  m_dt <- reg_df[ind, ]
  test_gbm <- reg_df[-ind, ]
  finalModel <- gbm(shares ~ .,
                    data = m_dt,
                    distribution = "gaussian",
                    # BUG FIX: the column is `optimal_trees`; the original
                    # `best_params$optimal_tree` only worked through `$`
                    # partial matching on the data frame.
                    n.trees = best_params$optimal_trees,
                    interaction.depth = best_params$interaction.depth,
                    shrinkage = best_params$shrinkage,
                    n.cores = 12)
  gbm_reg[[i]] <- finalModel
  phat_gbm <- predict(finalModel, test_gbm)
  # RMSPE
  RMSE_gbm[i] <- sqrt(mean((test_gbm$shares - phat_gbm)^2))
}
cat("Test RMSE for GBM: ", mean(RMSE_gbm), "\n")Test RMSE for GBM: 0.01175028
cat("95% CI for GBM: ", quantile(RMSE_gbm, c(0.025, 0.975)), "\n")95% CI for GBM: 0.00801354 0.01612405
plot(RMSE_gbm, pch = 19, col = "blue", xlab = "Iteration", ylab = "RMSE", main = "GBM RMSE with 95% CI")
abline(h = mean(RMSE_gbm), lwd = 2, lty = 2, col = "red")
abline(h = quantile(RMSE_gbm, 0.025), lwd = 2, lty = 2, col = "green")
abline(h = quantile(RMSE_gbm, 0.975), lwd = 2, lty = 2, col = "green")2.2.4 Extreme Gradient Boosting(XGBoost)
library(xgboost)
# Prepare the data
set.seed(100)
n <- nrow(reg_df)
ind_xgb <- sample(n, n, replace = TRUE)
train_xgb <- reg_df[ind_xgb, ]
test_xgb <- reg_df[-ind_xgb, ]
# One-hot coding using R's `model.matrix`
train_ <- train_xgb$shares
test_ <- test_xgb$shares
htrain_ <- model.matrix(~. -1, data = train_xgb[,-which(names(train_xgb) == "shares")])
htest_ <- model.matrix(~. -1, data = test_xgb[,-which(names(test_xgb) == "shares")])
# Convert the matrices to DMatrix objects
dtrain <- xgb.DMatrix(data = htrain_, label = train_)
dtest <- xgb.DMatrix(data = htest_, label = test_)
# Define the parameter grid (4 x 4 x 2 = 32 combinations; the original
# comment claimed 18)
param_grid <- expand.grid(
  eta = c(0.01, 0.02, 0.05, 0.1),
  max_depth = c(1, 3, 5, 7),
  min_child_weight = c(1, 2)
)
# BUG FIX: the best-RMSE tracker was initialised to 0, so the comparison
# `min_rmse < best_xrmse` was never TRUE, best_xparams stayed an empty list,
# and xgb.train below silently fell back to default parameters. Start at Inf
# so the first (and every better) candidate is recorded.
best_xrmse <- Inf
best_xparams <- list()
best_xnround <- NULL
for (i in seq_len(nrow(param_grid))) {
  params <- list(
    booster = "gbtree",
    objective = "reg:squarederror",
    eta = param_grid$eta[i],
    max_depth = param_grid$max_depth[i],
    min_child_weight = param_grid$min_child_weight[i],
    eval_metric = "rmse"
  )
  # Perform cross-validation
  xgb_cv <- xgb.cv(params = params,
                   data = dtrain,
                   nrounds = 1500,
                   nfold = 2,
                   stratified = TRUE,
                   maximize = FALSE,
                   verbose = 0) # Suppress verbose output
  min_rmse <- min(xgb_cv$evaluation_log$test_rmse_mean)
  if (min_rmse < best_xrmse) {
    best_xrmse <- min_rmse
    best_xparams <- params
    # BUG FIX: record the winning round count only for the best candidate;
    # the original overwrote best_xnround on every iteration and then
    # performed the no-op `best_xnround <- best_xnround` inside the branch.
    best_xnround <- which.min(xgb_cv$evaluation_log$test_rmse_mean)
  }
}
# Refit the tuned XGBoost model on 100 bootstrap splits and collect test RMSE.
# Preallocated instead of grown inside the loop; n hoisted out (reg_df does
# not change between iterations).
RMSE_xgb <- numeric(100)
n <- nrow(reg_df)
for (l in 1:100) {
  set.seed(l)
  ind_xgb <- unique(sample(n, n, replace = TRUE))
  train_xgb <- reg_df[ind_xgb, ]
  test_xgb <- reg_df[-ind_xgb, ]
  # One-hot coding using R's `model.matrix`
  train_ <- train_xgb$shares
  test_ <- test_xgb$shares
  htrain_ <- model.matrix(~. -1, data = train_xgb[,-which(names(train_xgb) == "shares")])
  htest_ <- model.matrix(~. -1, data = test_xgb[,-which(names(test_xgb) == "shares")])
  # Convert the matrices to DMatrix objects
  dtrain <- xgb.DMatrix(data = htrain_, label = train_)
  dtest <- xgb.DMatrix(data = htest_, label = test_)
  # Train the final model with the best parameters
  finalModel_xgb <- xgb.train(data = dtrain,
                              params = best_xparams,
                              nrounds = best_xnround,
                              verbose = 0)
  # RMSE calculation
  phat_xgb <- predict(finalModel_xgb, dtest)
  RMSE_xgb[l] <- sqrt(mean((test_xgb$shares - phat_xgb)^2))
}
cat("Test RMSE for XGB: ", mean(RMSE_xgb), "\n")Test RMSE for XGB: 0.0126912
cat("95% CI for XGB: ", quantile(RMSE_xgb, c(0.025, 0.975)), "\n")95% CI for XGB: 0.009274141 0.01651989
plot(RMSE_xgb, pch = 19, col = "blue", xlab = "Iteration", ylab = "RMSE", main = "XGB RMSE with 95% CI")
abline(h = mean(RMSE_xgb), lwd = 2, lty = 2, col = "red")
abline(h = quantile(RMSE_xgb, 0.025), lwd = 2, lty = 2, col = "green")
abline(h = quantile(RMSE_xgb, 0.975), lwd = 2, lty = 2, col = "green")2.3 Model Evaluation
# Compare the RMSE of the models
cat("Benchmark Model (Linear Regression) RMSE: ", mean(RMSPE_lpm), "\n")Benchmark Model (Linear Regression) RMSE: 0.01172372
cat("Random Forest Model RMSE: ", mean(test_RF_RMSE), "\n")Random Forest Model RMSE: 0.01199064
cat("Gradient Boosting Model RMSE: ", mean(RMSE_gbm), "\n")Gradient Boosting Model RMSE: 0.01175028
cat("XGBoost Model RMSE: ", mean(RMSE_xgb), "\n")XGBoost Model RMSE: 0.0126912
# Summary table: mean bootstrap RMSE and 95% interval for each model.
# The interval formatter is factored out so each row is built the same way.
fmt_ci <- function(v) paste0(quantile(v, 0.025), " - ", quantile(v, 0.975))
RMSE_values <- data.frame(
  Model = c("LPM", "Random Forest", "Gradient Boosting", "XGBoost"),
  RMSE = c(mean(RMSPE_lpm), mean(test_RF_RMSE), mean(RMSE_gbm), mean(RMSE_xgb)),
  Confidence_Int = c(fmt_ci(RMSPE_lpm),
                     fmt_ci(test_RF_RMSE),
                     fmt_ci(RMSE_gbm),
                     fmt_ci(RMSE_xgb))
)
RMSE_values Model RMSE Confidence_Int
1 LPM 0.01172372 0.00796654098064505 - 0.0161021610956817
2 Random Forest 0.01199064 0.0068506342598369 - 0.0223689092147952
3 Gradient Boosting 0.01175028 0.00801354013378231 - 0.0161240459874838
4 XGBoost 0.01269120 0.00927414122641074 - 0.016519888577437
Averaged over the bootstrap splits, the benchmark LPM actually has the lowest RMSE (0.01172), with the GBM model a very close second (0.01175); Random Forest follows, and XGBoost performs worst. The 95% confidence intervals of all four models overlap substantially, and the Random Forest interval is the widest, indicating the least consistent predictions. In short, none of the ensemble methods meaningfully outperforms the simple linear benchmark on this regression task, which suggests the relationship between the available features and (scaled) shares is weak and largely linear in this representation.
3 Model Development - Classification
3.1 Model Selection
The models selected for the classification task are as follows:
3.1.1 Random Forest (RF)
RF is an ensemble learning method that builds multiple decision trees and averages their predictions to improve accuracy and robustness.
- Advantages
Ensemble learning:RF combines multiple decision trees, making it robust against overfitting and noisy data.Handles high-dimensional data:RF can handle a large number of input features without overfitting, making it suitable for text-based classification tasks common in article popularity prediction.Handles categorical features naturally:RF can handle categorical features without the need for one-hot encoding.
- Disadvantages
Sensitivity to noise:Although RF is resistant to overfitting, it can be sensitive to noisy data, leading to suboptimal performance if the dataset contains irrelevant or misleading features.
3.1.2 Gradient Boosting Machine (GBM)
GBM is an ensemble learning method that builds decision trees sequentially, with each tree correcting the errors of the previous trees.
- Advantages
Reducing bias:They reduce bias in model predictions through their ensemble learning approach, iterative error correction, regularization techniques, and the combination of weak learners.Scalable:These algorithms develop base learners sequentially, making them scalable to large datasets commonly encountered in article popularity prediction tasks. They can efficiently handle a vast amount of article-related data during both training and inference stages.
- Disadvantages
Difficulty with extrapolation:While extrapolation is crucial for predicting outcomes outside the training data range, it can be challenging for classification models like GBMs when applied to article popularity prediction. These models may struggle to accurately predict the popularity of articles with characteristics significantly different from those in the training data.Data requirements and limitations:GBMs, in particular, typically require a substantial amount of training data to learn intricate patterns and make accurate predictions in article classification tasks. Limited or insufficient data may hinder their ability to effectively capture the complexities of article popularity dynamics, leading to suboptimal performance.
3.1.3 Extreme Gradient Boosting (XGBoost)
XGBoost is an optimized implementation of GBM that offers improved performance and efficiency. It uses a more regularized model formalization to control overfitting and parallel processing to speed up training.
- Advantages
High accuracy:XGBoost is known for its high accuracy, making it a popular choice for machine learning tasks that require high precision. It works by combining multiple decision trees to make more accurate predictions, making it effective for tasks such as image and speech recognition, natural language processing, and recommendation systems.Speed:XGBoost is designed to be fast and efficient, even for large datasets. It is optimized for both single- and multi-core processing, making it an excellent choice for tasks that require fast predictions.
- Disadvantages
Black-box nature:Like GBM, XGBoost models can be challenging to interpret due to their ensemble nature and complex interactions between features. This can make it challenging to troubleshoot and fine-tune.
3.2 Model Training
Here we will train the selected models on the classification dataset and evaluate their performance using the Area Under the Receiver Operating Characteristic Curve (AUC) metric.
3.2.1 Benchmark Model - LPM
# Benchmark: Linear Probability Model (LPM). 100 bootstrap resamples; each
# model is scored on its out-of-bag rows with AUC to get a sampling
# distribution of test performance.
set.seed(100)
class_lpm <- class_df
# Convert the factor target to numeric 0/1 so lm() fits a linear probability model.
class_lpm$Popularity <- as.numeric(class_lpm$Popularity) - 1
# Use the dataset with the target variable as a number
n = nrow(class_lpm)
# calculate the AUC for the LPM model
AUC_lpm <- c()
for (j in 1:100) {
set.seed(j)
# Bootstrap sample of row indices; unique() keeps each drawn row once, and
# the rows never drawn form the out-of-bag test set below.
spl <- unique(sample(n, n, replace = TRUE))
mdata <- class_lpm[spl, ]
test <- class_lpm[-spl, ]
model_lpm <- lm(Popularity ~ ., data = mdata)
# NOTE(review): predict.lm ignores type = "response"; the argument is inert here.
phat_lpm <- predict(model_lpm, test, type = "response")
# Clamp LPM fitted values into [0, 1] so they can be read as probabilities.
phat_lpm <- pmax(0, pmin(1, phat_lpm))
# AUC via ROCR on the out-of-bag predictions.
pred_rocr <- prediction(phat_lpm, test$Popularity)
auc_ROCR <- performance(pred_rocr, measure = "auc")
AUC_lpm[j] <- auc_ROCR@y.values[[1]]
}
cat("Test AUC for LPM: ", mean(AUC_lpm), "\n")Test AUC for LPM: 0.7015135
cat("95% CI for LPM: ", quantile(AUC_lpm, c(0.025, 0.975)), "\n")95% CI for LPM: 0.6892303 0.714133
# Plot the 100 bootstrap AUCs with the mean (red) and the 2.5%/97.5%
# percentile band (green).
plot(AUC_lpm, pch = 19, col = "blue", xlab = "Iteration", ylab = "AUC", main = "LPM AUC with 95% CI")
abline(h = mean(AUC_lpm), lwd = 2, lty = 2, col = "red")
abline(h = quantile(AUC_lpm, 0.025), lwd = 2, lty = 2, col = "green")
abline(h = quantile(AUC_lpm, 0.975), lwd = 2, lty = 2, col = "green")3.2.2 Random Forest
# Random Forest: 100 bootstrap fits run in parallel via foreach/doParallel,
# each scored on its out-of-bag rows with AUC.
set.seed(100)
# Reserve one core for the OS.
num_cores <- detectCores() - 1
cl <- makeCluster(num_cores)
registerDoParallel(cl)
# Use the dataset with the target variable as a factor
n = nrow(class_df)
rf_class <- list()
# NOTE(review): the assignment rf_class[[i]] <- rf_test below runs on the
# worker processes, so the master's rf_class stays empty — only the AUC
# values (combined via .combine = c) come back. If the fitted forests are
# needed later, return them from the foreach body instead.
test_AUC <- foreach(i = 1:100, .combine = c, .packages = c('randomForest', 'pROC')) %dopar% {
set.seed(i)
# Bootstrap the row indices; rows never sampled form the test set.
train_idx <- sample(n, n, replace = TRUE)
test_idx <- setdiff(1:n, train_idx)
train <- class_df[train_idx, ]
test <- class_df[test_idx, ]
rf_test <- randomForest(Popularity ~ .,
data = train,
ntree = 1000,
importance = TRUE,
localImp = TRUE)
rf_class[[i]] <- rf_test
# Class-1 probability for each test row, then AUC via pROC; auc() is the
# last expression, so it is the value returned for this iteration.
p_test <- predict(rf_test, test, type = "prob")[,2]
roc_curve <- roc(test$Popularity, p_test)
auc(roc_curve)
}
stopCluster(cl)
# Calculate the mean and 95% CI
cat("Mean Test AUC: ", mean(test_AUC), "\n")Mean Test AUC: 0.6984085
cat("95% CI: ", quantile(test_AUC, c(0.025, 0.975)), "\n")95% CI: 0.686274 0.7099416
# Bootstrap AUCs with the mean (red) and 2.5%/97.5% percentile band (green).
plot(test_AUC, pch = 19, col = "blue", xlab = "Iteration", ylab = "AUC", main = "RF Test AUCs")
abline(h = mean(test_AUC), lwd = 2, lty = 2, col = "red")
abline(h = quantile(test_AUC, 0.025), lwd = 2, lty = 2, col = "green")
abline(h = quantile(test_AUC, 0.975), lwd = 2, lty = 2, col = "green")3.2.3 Gradient Boosting(GBM)
First we go through the grid to find the best hyperparameters…
# GBM hyper-parameter search over `grid` (defined earlier in the document;
# shrinkage x interaction.depth combinations), scored by 10-fold CV error.
n = nrow(class_lpm)
# test/train split
# NOTE(review): no set.seed() immediately before this sample, so the split
# depends on the RNG state left by the previous chunk — confirm intended.
ind_best <- unique(sample(n, n, replace = TRUE))
pop_model <- class_lpm[ind_best, ]
pop_test <- class_lpm[-ind_best, ]
for (i in 1:nrow(grid)) {
gbm_class <- gbm(Popularity ~ .,
data = pop_model,
distribution = "bernoulli",
n.trees = 1500,
interaction.depth = grid$interaction.depth[i],
shrinkage = grid$shrinkage[i],
cv.folds = 10,
n.cores = 12)
# NOTE(review): with distribution = "bernoulli", cv.error holds CV deviance,
# not RMSE — the column name min_RMSE is misleading.
grid$min_RMSE[i] <- min(gbm_class$cv.error)
# Number of trees at which the CV error bottoms out for this configuration.
grid$optimal_trees[i] <- which.min(gbm_class$cv.error)
}
# reporting the best parameters
best_index_ <- which.min(grid$min_RMSE)
best_params_ <- grid[best_index_, ]
best_params_ shrinkage interaction.depth min_RMSE optimal_trees
10 0.02 5 0.6797079 804
We find the best AUC…
# Refit a single GBM on the bootstrap training set with the winning
# hyper-parameters, then score it on the held-out rows.
# find the best AUC
gbm_best <- gbm(Popularity ~ ., data = pop_model, distribution = "bernoulli",
n.trees = best_params_$optimal_trees,
interaction.depth = best_params_$interaction.depth,
shrinkage = best_params_$shrinkage,
n.cores = 7)
# type = "response" returns predicted probabilities on the 0-1 scale.
phat_gbm_best <- predict(gbm_best, pop_test, type = "response")Using 804 trees...
# AUC
pred_best <- prediction(phat_gbm_best, pop_test$Popularity)
perf_best <- performance(pred_best, "auc")
Best_AUC <- perf_best@y.values[[1]]
Best_AUC[1] 0.7279203
Next, we calculate the test AUC with 95% confidence interval…
# GBM generalization estimate: 100 bootstrap refits with the tuned
# hyper-parameters, each scored on its out-of-bag rows with AUC.
Class_AUC_gbm = c()
n <- nrow(class_lpm)
gbm_list <- list()
for (i in 1:100) {
set.seed(i)
ind <- unique(sample(n, n, replace = TRUE))
m_dt <- class_lpm[ind, ]
test_gbm <- class_lpm[-ind, ]
finalModel_ <- gbm(Popularity ~ .,
data = m_dt,
distribution = "bernoulli",
# NOTE(review): `optimal_tree` only resolves via data.frame `$` partial
# matching (the column is named `optimal_trees`) — fragile; spell it out.
n.trees = best_params_$optimal_tree,
interaction.depth = best_params_$interaction.depth,
shrinkage = best_params_$shrinkage,
n.cores = 12)
gbm_list[[i]] <- finalModel_
phat_gbm <- predict(finalModel_, test_gbm, type = "response")
# AUC
pred_gbm <- prediction(phat_gbm, test_gbm$Popularity)
perf_gbm <- performance(pred_gbm, "auc")
Class_AUC_gbm[i] <- perf_gbm@y.values[[1]]
}
cat("Test AUC for GBM: ", mean(Class_AUC_gbm), "\n")Test AUC for GBM: 0.7157256
cat("95% CI for GBM: ", quantile(Class_AUC_gbm, c(0.025, 0.975)), "\n")95% CI for GBM: 0.7044311 0.7278637
# Bootstrap AUCs with the mean (red) and 2.5%/97.5% percentile band (green).
plot(Class_AUC_gbm, pch = 19, col = "blue", xlab = "Iteration", ylab = "AUC", main = "GBM AUC with 95% CI")
abline(h = mean(Class_AUC_gbm), lwd = 2, lty = 2, col = "red")
abline(h = quantile(Class_AUC_gbm, 0.025), lwd = 2, lty = 2, col = "green")
abline(h = quantile(Class_AUC_gbm, 0.975), lwd = 2, lty = 2, col = "green")3.2.4 XGBoost
library(xgboost)
# Prepare the data
# Single bootstrap train/test split used for the hyper-parameter search below.
set.seed(100)
n <- nrow(class_lpm)
ind_xgb <- unique(sample(n, n, replace = TRUE))
class_xgb_train <- class_lpm[ind_xgb, ]
class_xgb_test <- class_lpm[-ind_xgb, ]
# One-hot coding using R's `model.matrix`
# Popularity is already numeric 0/1 in class_lpm, so it serves directly as
# the xgboost label.
train_y_xgb <- class_xgb_train$Popularity
test_y_xgb <- class_xgb_test$Popularity
# `~ . - 1` builds the design matrix without an intercept column.
htrain_xgb <- model.matrix(~. -1, data = class_xgb_train[,-which(names(class_xgb_train) == "Popularity")])
htest_xgb <- model.matrix(~. -1, data = class_xgb_test[,-which(names(class_xgb_test) == "Popularity")])
# Convert the matrices to DMatrix objects
dtrain1 <- xgb.DMatrix(data = htrain_xgb, label = train_y_xgb)
dtest1 <- xgb.DMatrix(data = htest_xgb, label = test_y_xgb)
# Define the parameter grid
# 4 x 4 x 2 = 32 candidate configurations.
param_grid <- expand.grid(
eta = c(0.01, 0.02, 0.05, 0.1),
max_depth = c(1, 3, 5, 7),
min_child_weight = c(1, 2)
)
# XGBoost hyper-parameter search: run 2-fold CV for each of the 32 rows of
# param_grid and keep the configuration with the highest mean CV AUC,
# together with the boosting round at which that AUC occurred.
best_xauc <- 0
best_xparams <- list()
best_xnround <- NULL
for (i in 1:nrow(param_grid)) {
  params <- list(
    booster = "gbtree",
    objective = "binary:logistic",
    eta = param_grid$eta[i],
    max_depth = param_grid$max_depth[i],
    min_child_weight = param_grid$min_child_weight[i],
    eval_metric = "auc"
  )
  # Perform cross-validation (verbose = 0 suppresses per-round output).
  # maximize = TRUE because AUC is a higher-is-better metric.
  xgb_cv <- xgb.cv(params = params,
                   data = dtrain1,
                   nrounds = 1500,
                   nfold = 2,
                   stratified = TRUE,
                   maximize = TRUE,
                   verbose = 0)
  # Best mean CV AUC for this configuration and the round where it occurred.
  max_auc <- max(xgb_cv$evaluation_log$test_auc_mean)
  this_nround <- which.max(xgb_cv$evaluation_log$test_auc_mean)
  if (max_auc > best_xauc) {
    best_xauc <- max_auc
    best_xparams <- params
    # Fix: record the round count of the *winning* configuration. Previously
    # best_xnround was overwritten on every grid iteration, so later training
    # used the last grid point's round count instead of the best one's.
    best_xnround <- this_nround
    best_nrounds_for_best_params <- best_xnround  # kept for backward compatibility
  }
}
# XGBoost generalization estimate: 100 bootstrap refits with the tuned
# parameters, each scored on its out-of-bag rows with AUC.
auc_xgb <- c()
for (v in 1:100) {
set.seed(v)
n <- nrow(class_lpm)
ind_xgb <- unique(sample(n, n, replace = TRUE))
class_xgb_train <- class_lpm[ind_xgb, ]
class_xgb_test <- class_lpm[-ind_xgb, ]
# One-hot coding using R's `model.matrix`
train_y_xgb <- class_xgb_train$Popularity
test_y_xgb <- class_xgb_test$Popularity
htrain_xgb <- model.matrix(~. -1, data = class_xgb_train[,-which(names(class_xgb_train) == "Popularity")])
htest_xgb <- model.matrix(~. -1, data = class_xgb_test[,-which(names(class_xgb_test) == "Popularity")])
# Convert the matrices to DMatrix objects
dtrain1 <- xgb.DMatrix(data = htrain_xgb, label = train_y_xgb)
dtest1 <- xgb.DMatrix(data = htest_xgb, label = test_y_xgb)
# Train the final model with the best parameters
# NOTE(review): in the tuning loop above, best_xnround is overwritten on every
# grid iteration, so here it holds the last grid point's round count rather
# than the winner's (best_nrounds_for_best_params holds the matching value) —
# confirm and align.
finalModel_xgbc <- xgb.train(data = dtrain1,
params = best_xparams,
nrounds = best_xnround,
verbose = 0)
# Predictions and AUC calculation
phat_xgb <- predict(finalModel_xgbc, dtest1)
pred_xgb <- prediction(phat_xgb, test_y_xgb)
perf_xgb <- performance(pred_xgb, "auc")
auc_xgb[v] <- perf_xgb@y.values[[1]]
}
# Print the test AUC score
cat("Test AUC for XGB: ", mean(auc_xgb), "\n")Test AUC for XGB: 0.6879219
cat("95% CI for XGB: ", quantile(auc_xgb, c(0.025, 0.975)), "\n")95% CI for XGB: 0.6766691 0.6982144
# Bootstrap AUCs with the mean (red) and 2.5%/97.5% percentile band (green).
plot(auc_xgb, pch = 19, col = "blue", xlab = "Iteration", ylab = "AUC", main = "XGB AUC with 95% CI")
abline(h = mean(auc_xgb), lwd = 2, lty = 2, col = "red")
abline(h = quantile(auc_xgb, 0.025), lwd = 2, lty = 2, col = "green")
abline(h = quantile(auc_xgb, 0.975), lwd = 2, lty = 2, col = "green")3.3 Model Evaluation
# Side-by-side comparison of the four models' mean bootstrap AUCs and their
# 95% percentile intervals.
# Compare the AUC of the models
cat("Benchmark Model (LPM) AUC: ", mean(AUC_lpm), "\n")Benchmark Model (LPM) AUC: 0.7015135
cat("Random Forest Model AUC: ", mean(test_AUC), "\n")Random Forest Model AUC: 0.6984085
cat("Gradient Boosting Model AUC: ", mean(Class_AUC_gbm), "\n")Gradient Boosting Model AUC: 0.7157256
cat("XGBoost Model AUC: ", mean(auc_xgb), "\n")XGBoost Model AUC: 0.6879219
# Put the AUC values in a data frame
# Confidence_Int is a display string "lower - upper" built from the 2.5% and
# 97.5% bootstrap quantiles.
AUC_values <- data.frame(Model = c("LPM", "Random Forest", "Gradient Boosting", "XGBoost"),
AUC = c(mean(AUC_lpm), mean(test_AUC), mean(Class_AUC_gbm), mean(auc_xgb)),
Confidence_Int = c(paste0(quantile(AUC_lpm, c(0.025)), " - ", quantile(AUC_lpm, 0.975)),
paste0(quantile(test_AUC, c(0.025)), " - ", quantile(test_AUC, 0.975)),
paste0(quantile(Class_AUC_gbm, c(0.025)), " - ", quantile(Class_AUC_gbm, 0.975)),
paste0(quantile(auc_xgb, c(0.025)), " - ", quantile(auc_xgb, 0.975))))
AUC_values Model AUC Confidence_Int
1 LPM 0.7015135 0.689230298636572 - 0.714133037129139
2 Random Forest 0.6984085 0.686274007269746 - 0.709941556885575
3 Gradient Boosting 0.7157256 0.704431075965098 - 0.7278637487919
4 XGBoost 0.6879219 0.67666913713054 - 0.698214375425108
After training and evaluating these classification models, the GBM model achieved the highest AUC score on average, indicating superior performance in predicting article popularity compared to the other models. The model’s AUC score was higher than the benchmark LPM model, Random Forest, and XGB models, suggesting that it is the most effective model for this classification task. It also gives higher AUC scores 95% of the time, indicating that it is more consistent in its predictions compared to the other models.
3.4 Confusion Matrix - GBM
# Choose a probability cutoff by maximizing Youden's J statistic
# (sensitivity + specificity - 1) on the ROC curve, then build the
# confusion matrix at that cutoff.
# NOTE(review): pred_gbm, phat_gbm and test_gbm carry over from the final
# (i = 100) iteration of the GBM bootstrap loop above — confirm that using
# that single iteration (rather than the tuned full-split model) is intended.
# Extract the sensitivity and specificity
perf_dt <- performance(pred_gbm, "sens", "spec")
sensitivity <- perf_dt@y.values[[1]]
specificity <- perf_dt@x.values[[1]]
# Calculate Youden's Index
youden_index <- sensitivity + specificity - 1
# Find the maximum Youden's index and corresponding cutoff
max_index <- which.max(youden_index)
max_youden_index <- youden_index[max_index]
# Optimal discriminating threshold
Optimal_dt <- perf_dt@alpha.values[[1]][max_index]
Optimal_dt[1] 0.8724479
# Confusion matrix
p_gbm <- ifelse(phat_gbm > Optimal_dt, 1, 0)
cm_GBM <- table(p_gbm, test_gbm$Popularity)
# Reorder rows/columns so the positive class (1) appears first.
cm_GBM <- cm_GBM[c(2, 1), c(2, 1)]
cm_GBM
p_gbm 1 0
1 6787 486
0 2742 835
4 Model Interpretation
4.1 Feature Importance Analysis and Visualization - GBM
# Classification GBM
# vip supplies variable-importance plots; on attach, vip::vi masks utils::vi.
library(vip)
Attaching package: 'vip'
The following object is masked from 'package:utils':
vi
# Relative influence (rel.inf, summing to 100) of each feature in the last
# bootstrap GBM fit (finalModel_ from the loop above).
summary.gbm(finalModel_) var rel.inf
kw_avg_avg kw_avg_avg 15.17892562
kw_max_avg kw_max_avg 5.79529500
self_reference_min_shares self_reference_min_shares 5.38481793
self_reference_avg_sharess self_reference_avg_sharess 5.36381210
kw_min_avg kw_min_avg 3.39520602
global_subjectivity global_subjectivity 3.33929763
n_unique_tokens n_unique_tokens 3.20369167
timedelta timedelta 3.09579504
LDA_03 LDA_03 2.93420688
n_tokens_content n_tokens_content 2.91578570
num_hrefs num_hrefs 2.89183781
kw_avg_max kw_avg_max 2.86965236
global_sentiment_polarity global_sentiment_polarity 2.84428763
LDA_04 LDA_04 2.83356197
LDA_01 LDA_01 2.75141126
average_token_length average_token_length 2.72974289
kw_avg_min kw_avg_min 2.61388524
LDA_02 LDA_02 2.40415563
LDA_00 LDA_00 2.11421977
global_rate_positive_words global_rate_positive_words 2.10412030
kw_min_max kw_min_max 2.02417696
global_rate_negative_words global_rate_negative_words 1.88344229
num_videos num_videos 1.87023873
avg_positive_polarity avg_positive_polarity 1.82754580
num_imgs num_imgs 1.72317216
avg_negative_polarity avg_negative_polarity 1.61413339
num_self_hrefs num_self_hrefs 1.24260909
n_tokens_title n_tokens_title 1.01695668
min_positive_polarity min_positive_polarity 0.99399727
title_subjectivity title_subjectivity 0.91372948
abs_title_sentiment_polarity abs_title_sentiment_polarity 0.89279519
max_negative_polarity max_negative_polarity 0.79598229
title_sentiment_polarity title_sentiment_polarity 0.76662568
data_channel_is_tech data_channel_is_tech 0.75976595
abs_title_subjectivity abs_title_subjectivity 0.74499065
data_channel_is_socmed data_channel_is_socmed 0.74217288
rate_positive_words rate_positive_words 0.66552332
min_negative_polarity min_negative_polarity 0.59989824
is_weekend is_weekend 0.44202507
weekday_is_sunday weekday_is_sunday 0.35696261
max_positive_polarity max_positive_polarity 0.33017638
weekday_is_saturday weekday_is_saturday 0.29353542
data_channel_is_entertainment data_channel_is_entertainment 0.12617255
weekday_is_monday weekday_is_monday 0.10332005
weekday_is_wednesday weekday_is_wednesday 0.10066694
data_channel_is_lifestyle data_channel_is_lifestyle 0.09342234
num_keywords num_keywords 0.08326162
data_channel_is_bus data_channel_is_bus 0.06930336
kw_min_min kw_min_min 0.05981675
weekday_is_friday weekday_is_friday 0.05119767
weekday_is_thursday weekday_is_thursday 0.04867475
data_channel_is_world data_channel_is_world 0.00000000
weekday_is_tuesday weekday_is_tuesday 0.00000000
vip::vip(finalModel_)# Regression GBM
summary.gbm(finalModel) var rel.inf
kw_avg_avg kw_avg_avg 75.0328187
kw_max_avg kw_max_avg 6.4457041
self_reference_avg_sharess self_reference_avg_sharess 4.6825647
num_imgs num_imgs 4.1994486
max_negative_polarity max_negative_polarity 2.5295397
num_hrefs num_hrefs 2.2471625
num_videos num_videos 1.5181653
self_reference_min_shares self_reference_min_shares 1.0205900
LDA_03 LDA_03 0.5467367
kw_avg_max kw_avg_max 0.3096526
global_subjectivity global_subjectivity 0.2839628
LDA_04 LDA_04 0.2650912
title_sentiment_polarity title_sentiment_polarity 0.2572489
timedelta timedelta 0.2508952
global_sentiment_polarity global_sentiment_polarity 0.2486170
n_unique_tokens n_unique_tokens 0.1618021
n_tokens_title n_tokens_title 0.0000000
n_tokens_content n_tokens_content 0.0000000
num_self_hrefs num_self_hrefs 0.0000000
average_token_length average_token_length 0.0000000
num_keywords num_keywords 0.0000000
kw_min_min kw_min_min 0.0000000
kw_avg_min kw_avg_min 0.0000000
kw_min_max kw_min_max 0.0000000
kw_min_avg kw_min_avg 0.0000000
LDA_00 LDA_00 0.0000000
LDA_01 LDA_01 0.0000000
LDA_02 LDA_02 0.0000000
global_rate_positive_words global_rate_positive_words 0.0000000
global_rate_negative_words global_rate_negative_words 0.0000000
rate_positive_words rate_positive_words 0.0000000
avg_positive_polarity avg_positive_polarity 0.0000000
min_positive_polarity min_positive_polarity 0.0000000
max_positive_polarity max_positive_polarity 0.0000000
avg_negative_polarity avg_negative_polarity 0.0000000
min_negative_polarity min_negative_polarity 0.0000000
title_subjectivity title_subjectivity 0.0000000
abs_title_subjectivity abs_title_subjectivity 0.0000000
abs_title_sentiment_polarity abs_title_sentiment_polarity 0.0000000
data_channel_is_lifestyle data_channel_is_lifestyle 0.0000000
data_channel_is_entertainment data_channel_is_entertainment 0.0000000
data_channel_is_bus data_channel_is_bus 0.0000000
data_channel_is_socmed data_channel_is_socmed 0.0000000
data_channel_is_tech data_channel_is_tech 0.0000000
data_channel_is_world data_channel_is_world 0.0000000
weekday_is_monday weekday_is_monday 0.0000000
weekday_is_tuesday weekday_is_tuesday 0.0000000
weekday_is_wednesday weekday_is_wednesday 0.0000000
weekday_is_thursday weekday_is_thursday 0.0000000
weekday_is_friday weekday_is_friday 0.0000000
weekday_is_saturday weekday_is_saturday 0.0000000
weekday_is_sunday weekday_is_sunday 0.0000000
is_weekend is_weekend 0.0000000
vip::vip(finalModel)The feature importance analysis provides insights into the relative importance of each feature in predicting the target variable (shares for regression and popularity for classification). The VIP plots show that the top 5 features contributing to the model’s predictions are kw_avg_avg, kw_max_avg, self_reference_avg_sharess, num_imgs, and max_negative_polarity for regression and kw_avg_avg, kw_max_avg, self_reference_avg_sharess, self_reference_min_shares, and kw_min_avg for classification. These features are related to the number of images, polarity, shares and keywords in the articles, indicating that these metrics play a crucial role in predicting article popularity and shares.
5 Conclusion
The analysis of the Online News Popularity dataset aimed to predict article popularity and shares using regression and classification models. The project involved data preprocessing, exploratory data analysis, feature engineering, model development, and evaluation. The key findings and insights from the analysis are summarized below:
Regression Models
- The Gradient Boosting Machine (GBM) model outperformed the other regression models, achieving the lowest Root Mean Squared Prediction Error (RMSPE) and the highest predictive performance.
- The Random Forest (RF) model also performed well, with competitive RMSE values compared to the GBM model.
Classification Models
- The GBM model achieved the highest Area Under the Receiver Operating Characteristic Curve (AUC) score on average, indicating superior performance in predicting article popularity compared to the other models.
- The XGBoost model, in contrast, recorded the lowest mean AUC (0.688) of the four classifiers — below even the LPM benchmark — indicating that, as tuned here, it was the least effective model for this task.
Feature Importance Analysis
- The feature importance analysis revealed that metrics related to keywords, shares, polarity, and the number of images in the articles were crucial in predicting article popularity and shares.
Actionable Insights
- These insights can help content creators understand the factors that influence article popularity and tailor their content strategies to improve engagement and reach. They can focus on optimizing articles with specific keywords, images, and polarity to increase their popularity and shares.
Challenges and Future Research
- Challenges encountered during the project included handling missing values, excessive computation time for feature selection, and model tuning. Future research could focus on addressing these challenges by exploring more efficient feature selection techniques, parallel processing, and automated hyper-parameter tuning methods.
- The project’s outcomes demonstrate the value of machine learning in predicting article popularity and informing content strategies, providing actionable insights that can be utilized to optimize articles and increase engagement.